1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/compile.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
  48                                   Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
   49   // Use t1 as a flag register to indicate the fast_lock result: zero for success; non-zero for failure.
  50   Register flag = t1;
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmp1Reg;
  54   Register tmp = tmp2Reg;
  55   Label object_has_monitor;
   56   // Finish fast lock successfully. locked MUST be reached with flag == 0
  57   Label locked;
   58   // Finish fast lock unsuccessfully. slow_path MUST be reached with flag != 0
  59   Label slow_path;
  60 
  61   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  62   assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);
  63 
  64   mv(flag, 1);
  65 
  66   // Load markWord from object into displaced_header.
  67   ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  68 
  69   if (DiagnoseSyncOnValueBasedClasses != 0) {
  70     load_klass(tmp, oop);
  71     lbu(tmp, Address(tmp, Klass::misc_flags_offset()));
  72     test_bit(tmp, tmp, exact_log2(KlassFlags::_misc_is_value_based_class));
  73     bnez(tmp, slow_path);
  74   }
  75 
  76   if (LockingMode == LM_MONITOR) {
  77     j(slow_path);
  78   } else {
  79     assert(LockingMode == LM_LEGACY, "must be");
  80 
  81     // Check for existing monitor
  82     test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value));
  83     bnez(tmp, object_has_monitor);
  84 
  85     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  86     ori(tmp, disp_hdr, markWord::unlocked_value);
  87 
  88     // Initialize the box. (Must happen before we update the object mark!)
  89     sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  90 
  91     // Compare object markWord with an unlocked value (tmp) and if
  92     // equal exchange the stack address of our box with object markWord.
  93     // On failure disp_hdr contains the possibly locked markWord.
  94     cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64,
  95             Assembler::aq, Assembler::rl, /*result*/disp_hdr);
  96     beq(disp_hdr, tmp, locked);
  97 
  98     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  99 
  100     // If the compare-and-exchange succeeded, then we found an unlocked
  101     // object, have now locked it, and will continue at label locked.
  102     // Otherwise we did not see an unlocked object, so try the fast recursive case.
 103 
 104     // Check if the owner is self by comparing the value in the
 105     // markWord of object (disp_hdr) with the stack pointer.
 106     sub(disp_hdr, disp_hdr, sp);
 107     mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
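          // E.g. with a 4 KiB page this is ~0xfff | 0x3 (lock_mask_in_place), i.e. 0xfffffffffffff003.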
 108     // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto label
 109     // locked, hence we can store 0 as the displaced header in the box, which indicates that it
 110     // is a recursive lock.
 111     andr(tmp/*==0?*/, disp_hdr, tmp);
 112     sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 113     beqz(tmp, locked);
 114     j(slow_path);
 115   }
 116 
 117   // Handle existing monitor.
 118   bind(object_has_monitor);
 119 
 120   // The object's monitor m is unlocked iff m->owner == nullptr,
 121   // otherwise m->owner may contain a thread id, a stack address for LM_LEGACY,
  122   // or the ANONYMOUS_OWNER constant for LM_LIGHTWEIGHT.
 123   //
 124   // Try to CAS m->owner from null to current thread id.
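        // disp_hdr still holds the tagged markWord (ObjectMonitor* | monitor_value), so subtracting
        // the tag and adding the owner offset yields the address of m->owner.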
 125   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
 126   Register tid = disp_hdr;
 127   ld(tid, Address(xthread, JavaThread::lock_id_offset()));
 128   cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/tid, Assembler::int64,
 129           Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected)
 130 
 131   // Store a non-null value into the box to avoid looking like a re-entrant
 132   // lock. The fast-path monitor unlock code checks for
 133   // markWord::monitor_value so use markWord::unused_mark which has the
 134   // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
 135   mv(tmp, (address)markWord::unused_mark().value());
 136   sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 137 
 138   beqz(tmp3Reg, locked); // CAS success means locking succeeded
 139 
 140   bne(tmp3Reg, tid, slow_path); // Check for recursive locking
 141 
 142   // Recursive lock case
 143   // Reload markWord from object into displaced_header.
 144   ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
 145   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg);
 146 
 147   bind(locked);
 148   mv(flag, zr);
 149   inc_held_monitor_count();
 150 
 151 #ifdef ASSERT
 152   // Check that locked label is reached with flag == 0.
 153   Label flag_correct;
 154   beqz(flag, flag_correct);
 155   stop("Fast Lock Flag != 0");
 156 #endif
 157 
 158   bind(slow_path);
 159 #ifdef ASSERT
 160   // Check that slow_path label is reached with flag != 0.
 161   bnez(flag, flag_correct);
 162   stop("Fast Lock Flag == 0");
 163   bind(flag_correct);
 164 #endif
 165   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 166 }
 167 
 168 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
 169                                     Register tmp1Reg, Register tmp2Reg) {
  170   // Use t1 as a flag register to indicate the fast_unlock result: zero for success; non-zero for failure.
 171   Register flag = t1;
 172   Register oop = objectReg;
 173   Register box = boxReg;
 174   Register disp_hdr = tmp1Reg;
 175   Register owner_addr = tmp1Reg;
 176   Register tmp = tmp2Reg;
 177   Label object_has_monitor;
  178   // Finish fast unlock successfully. unlocked MUST be reached with flag == 0
 179   Label unlocked;
  180   // Finish fast unlock unsuccessfully. slow_path MUST be reached with flag != 0
 181   Label slow_path;
 182 
 183   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 184   assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);
 185 
 186   mv(flag, 1);
 187 
 188   if (LockingMode == LM_MONITOR) {
 189     j(slow_path);
 190   } else {
 191     assert(LockingMode == LM_LEGACY, "must be");
 192   }
 193 
 194   // Find the lock address and load the displaced header from the stack.
 195   ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 196 
 197   // If the displaced header is 0, we have a recursive unlock.
 198   beqz(disp_hdr, unlocked);
 199 
 200   // Handle existing monitor.
 201   ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 202   test_bit(t0, tmp, exact_log2(markWord::monitor_value));
 203   bnez(t0, object_has_monitor);
 204 
  205   // Check if it is still a lightweight lock; this is true if we
 206   // see the stack address of the basicLock in the markWord of the
 207   // object.
 208   cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64,
 209           Assembler::relaxed, Assembler::rl, /*result*/tmp);
 210   beq(box, tmp, unlocked); // box == tmp if cas succeeds
 211   j(slow_path);
 212 
 213   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 214 
 215   // Handle existing monitor.
 216   bind(object_has_monitor);
 217   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 218   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 219 
 220   ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 221 
 222   Label notRecursive;
 223   beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.
 224 
 225   // Recursive lock
 226   addi(disp_hdr, disp_hdr, -1);
 227   sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 228   j(unlocked);
 229 
 230   bind(notRecursive);
 231   // Compute owner address.
 232   la(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));
 233 
 234   // Set owner to null.
 235   // Release to satisfy the JMM
 236   membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 237   sd(zr, Address(owner_addr));
 238   // We need a full fence after clearing owner to avoid stranding.
 239   // StoreLoad achieves this.
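        // Without it we could observe the entry lists as empty while a contending thread, which has
        // just enqueued itself, still sees a non-null owner and parks; that thread would then be
        // stranded with nobody left to wake it.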
 240   membar(StoreLoad);
 241 
 242   // Check if the entry lists are empty.
 243   ld(t0, Address(tmp, ObjectMonitor::EntryList_offset()));
 244   ld(tmp1Reg, Address(tmp, ObjectMonitor::cxq_offset()));
 245   orr(t0, t0, tmp1Reg);
 246   beqz(t0, unlocked); // If so we are done.
 247 
 248   // Check if there is a successor.
 249   ld(t0, Address(tmp, ObjectMonitor::succ_offset()));
 250   bnez(t0, unlocked); // If so we are done.
 251 
 252   // Save the monitor pointer in the current thread, so we can try to
 253   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 254   sd(tmp, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));
 255 
 256   mv(flag, 1);
 257   j(slow_path);
 258 
 259   bind(unlocked);
 260   mv(flag, zr);
 261   dec_held_monitor_count();
 262 
 263 #ifdef ASSERT
 264   // Check that unlocked label is reached with flag == 0.
 265   Label flag_correct;
 266   beqz(flag, flag_correct);
  267   stop("Fast Unlock Flag != 0");
 268 #endif
 269 
 270   bind(slow_path);
 271 #ifdef ASSERT
 272   // Check that slow_path label is reached with flag != 0.
 273   bnez(flag, flag_correct);
  274   stop("Fast Unlock Flag == 0");
 275   bind(flag_correct);
 276 #endif
 277   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 278 }
 279 
 280 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box,
 281                                               Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
 282   // Flag register, zero for success; non-zero for failure.
 283   Register flag = t1;
 284 
 285   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 286   assert_different_registers(obj, box, tmp1, tmp2, tmp3, tmp4, flag, t0);
 287 
 288   mv(flag, 1);
 289 
 290   // Handle inflated monitor.
 291   Label inflated;
  292   // Finish fast lock successfully. locked MUST be reached with flag == 0
 293   Label locked;
  294   // Finish fast lock unsuccessfully. slow_path MUST be reached with flag != 0
 295   Label slow_path;
 296 
 297   if (UseObjectMonitorTable) {
 298     // Clear cache in case fast locking succeeds.
 299     sd(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 300   }
 301 
 302   if (DiagnoseSyncOnValueBasedClasses != 0) {
 303     load_klass(tmp1, obj);
 304     lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
 305     test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
 306     bnez(tmp1, slow_path);
 307   }
 308 
 309   const Register tmp1_mark = tmp1;
 310   const Register tmp3_t = tmp3;
 311 
 312   { // Lightweight locking
 313 
  314     // Push lock to the lock stack and finish successfully. MUST be reached with flag == 0
 315     Label push;
 316 
 317     const Register tmp2_top = tmp2;
 318 
 319     // Check if lock-stack is full.
 320     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 321     mv(tmp3_t, (unsigned)LockStack::end_offset());
 322     bge(tmp2_top, tmp3_t, slow_path);
 323 
 324     // Check if recursive.
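          // (recursive means obj is already the topmost entry on this thread's lock-stack)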
 325     add(tmp3_t, xthread, tmp2_top);
 326     ld(tmp3_t, Address(tmp3_t, -oopSize));
 327     beq(obj, tmp3_t, push);
 328 
 329     // Relaxed normal load to check for monitor. Optimization for monitor case.
 330     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 331     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 332     bnez(tmp3_t, inflated);
 333 
 334     // Not inflated
 335     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
 336 
 337     // Try to lock. Transition lock-bits 0b01 => 0b00
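          // ori forces the expected value to the unlocked pattern (lock bits 0b01); xori then clears
          // that bit to produce the fast-locked value (lock bits 0b00).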
 338     ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
 339     xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
 340     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 341             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
 342     bne(tmp1_mark, tmp3_t, slow_path);
 343 
 344     bind(push);
 345     // After successful lock, push object on lock-stack.
 346     add(tmp3_t, xthread, tmp2_top);
 347     sd(obj, Address(tmp3_t));
 348     addw(tmp2_top, tmp2_top, oopSize);
 349     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 350     j(locked);
 351   }
 352 
 353   { // Handle inflated monitor.
 354     bind(inflated);
 355 
 356     const Register tmp1_monitor = tmp1;
 357 
 358     if (!UseObjectMonitorTable) {
 359       assert(tmp1_monitor == tmp1_mark, "should be the same here");
 360     } else {
 361       Label monitor_found;
 362 
 363       // Load cache address
 364       la(tmp3_t, Address(xthread, JavaThread::om_cache_oops_offset()));
 365 
 366       const int num_unrolled = 2;
 367       for (int i = 0; i < num_unrolled; i++) {
 368         ld(tmp1, Address(tmp3_t));
 369         beq(obj, tmp1, monitor_found);
 370         add(tmp3_t, tmp3_t, in_bytes(OMCache::oop_to_oop_difference()));
 371       }
 372 
 373       Label loop;
 374 
 375       // Search for obj in cache.
 376       bind(loop);
 377 
 378       // Check for match.
 379       ld(tmp1, Address(tmp3_t));
 380       beq(obj, tmp1, monitor_found);
 381 
 382       // Search until null encountered, guaranteed _null_sentinel at end.
 383       add(tmp3_t, tmp3_t, in_bytes(OMCache::oop_to_oop_difference()));
 384       bnez(tmp1, loop);
 385       // Cache Miss. Take the slowpath.
 386       j(slow_path);
 387 
 388       bind(monitor_found);
 389       ld(tmp1_monitor, Address(tmp3_t, OMCache::oop_to_monitor_difference()));
 390     }
 391 
 392     const Register tmp2_owner_addr = tmp2;
 393     const Register tmp3_owner = tmp3;
 394 
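          // Without UseObjectMonitorTable, tmp1_monitor is still the tagged markWord (ObjectMonitor* | monitor_value),
          // so the tag is folded into the field offsets below; with the table it holds a raw ObjectMonitor*.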
 395     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 396     const Address owner_address(tmp1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 397     const Address recursions_address(tmp1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 398 
 399     Label monitor_locked;
 400 
 401     // Compute owner address.
 402     la(tmp2_owner_addr, owner_address);
 403 
 404     // CAS owner (null => current thread).
 405     ld(tmp4, Address(xthread, JavaThread::lock_id_offset()));
 406     cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ tmp4, Assembler::int64,
 407             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
 408     beqz(tmp3_owner, monitor_locked);
 409 
 410     // Check if recursive.
 411     bne(tmp3_owner, tmp4, slow_path);
 412 
 413     // Recursive.
 414     increment(recursions_address, 1, tmp2, tmp3);
 415 
 416     bind(monitor_locked);
 417     if (UseObjectMonitorTable) {
 418       sd(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 419     }
 420   }
 421 
 422   bind(locked);
 423   mv(flag, zr);
 424 
 425 #ifdef ASSERT
 426   // Check that locked label is reached with flag == 0.
 427   Label flag_correct;
 428   beqz(flag, flag_correct);
 429   stop("Fast Lock Flag != 0");
 430 #endif
 431 
 432   bind(slow_path);
 433 #ifdef ASSERT
 434   // Check that slow_path label is reached with flag != 0.
 435   bnez(flag, flag_correct);
 436   stop("Fast Lock Flag == 0");
 437   bind(flag_correct);
 438 #endif
 439   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 440 }
 441 
 442 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box,
 443                                                 Register tmp1, Register tmp2, Register tmp3) {
 444   // Flag register, zero for success; non-zero for failure.
 445   Register flag = t1;
 446 
 447   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 448   assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0);
 449 
 450   mv(flag, 1);
 451 
 452   // Handle inflated monitor.
 453   Label inflated, inflated_load_mark;
  454   // Finish fast unlock successfully. unlocked MUST be reached with flag == 0
 455   Label unlocked;
  456   // Finish fast unlock unsuccessfully. slow_path MUST be reached with flag != 0
 457   Label slow_path;
 458 
 459   const Register tmp1_mark = tmp1;
 460   const Register tmp2_top = tmp2;
 461   const Register tmp3_t = tmp3;
 462 
 463   { // Lightweight unlock
 464     Label push_and_slow_path;
 465 
 466     // Check if obj is top of lock-stack.
 467     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 468     subw(tmp2_top, tmp2_top, oopSize);
 469     add(tmp3_t, xthread, tmp2_top);
 470     ld(tmp3_t, Address(tmp3_t));
 471     // Top of lock stack was not obj. Must be monitor.
 472     bne(obj, tmp3_t, inflated_load_mark);
 473 
 474     // Pop lock-stack.
 475     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 476     DEBUG_ONLY(sd(zr, Address(tmp3_t));)
 477     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 478 
 479     // Check if recursive.
 480     add(tmp3_t, xthread, tmp2_top);
 481     ld(tmp3_t, Address(tmp3_t, -oopSize));
 482     beq(obj, tmp3_t, unlocked);
 483 
 484     // Not recursive.
 485     // Load Mark.
 486     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 487 
 488     // Check header for monitor (0b10).
 489     // Because we got here by popping (meaning we pushed in locked)
 490     // there will be no monitor in the box. So we need to push back the obj
 491     // so that the runtime can fix any potential anonymous owner.
 492     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 493     bnez(tmp3_t, UseObjectMonitorTable ? push_and_slow_path : inflated);
 494 
 495     // Try to unlock. Transition lock bits 0b00 => 0b01
 496     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 497     ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
 498     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 499             /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
 500     beq(tmp1_mark, tmp3_t, unlocked);
 501 
 502     bind(push_and_slow_path);
 503     // Compare and exchange failed.
 504     // Restore lock-stack and handle the unlock in runtime.
 505     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 506     DEBUG_ONLY(sd(obj, Address(tmp3_t));)
 507     addw(tmp2_top, tmp2_top, oopSize);
 508     sd(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 509     j(slow_path);
 510   }
 511 
 512   { // Handle inflated monitor.
 513     bind(inflated_load_mark);
 514     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 515 #ifdef ASSERT
 516     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 517     bnez(tmp3_t, inflated);
 518     stop("Fast Unlock not monitor");
 519 #endif
 520 
 521     bind(inflated);
 522 
 523 #ifdef ASSERT
 524     Label check_done;
 525     subw(tmp2_top, tmp2_top, oopSize);
 526     mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
 527     blt(tmp2_top, tmp3_t, check_done);
 528     add(tmp3_t, xthread, tmp2_top);
 529     ld(tmp3_t, Address(tmp3_t));
 530     bne(obj, tmp3_t, inflated);
 531     stop("Fast Unlock lock on stack");
 532     bind(check_done);
 533 #endif
 534 
 535     const Register tmp1_monitor = tmp1;
 536 
 537     if (!UseObjectMonitorTable) {
 538       assert(tmp1_monitor == tmp1_mark, "should be the same here");
 539       // Untag the monitor.
 540       add(tmp1_monitor, tmp1_mark, -(int)markWord::monitor_value);
 541     } else {
 542       ld(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 543       // No valid pointer below alignof(ObjectMonitor*). Take the slow path.
 544       mv(tmp3_t, alignof(ObjectMonitor*));
 545       bltu(tmp1_monitor, tmp3_t, slow_path);
 546     }
 547 
 548     const Register tmp2_recursions = tmp2;
 549     Label not_recursive;
 550 
 551     // Check if recursive.
 552     ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 553     beqz(tmp2_recursions, not_recursive);
 554 
 555     // Recursive unlock.
 556     addi(tmp2_recursions, tmp2_recursions, -1);
 557     sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 558     j(unlocked);
 559 
 560     bind(not_recursive);
 561 
 562     const Register tmp2_owner_addr = tmp2;
 563 
 564     // Compute owner address.
 565     la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));
 566 
 567     // Set owner to null.
 568     // Release to satisfy the JMM
 569     membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 570     sd(zr, Address(tmp2_owner_addr));
 571     // We need a full fence after clearing owner to avoid stranding.
 572     // StoreLoad achieves this.
 573     membar(StoreLoad);
 574 
 575     // Check if the entry lists are empty.
 576     ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset()));
 577     ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset()));
 578     orr(t0, t0, tmp3_t);
 579     beqz(t0, unlocked); // If so we are done.
 580 
 581     // Check if there is a successor.
 582     ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::succ_offset()));
 583     bnez(tmp3_t, unlocked); // If so we are done.
 584 
 585     // Save the monitor pointer in the current thread, so we can try
 586     // to reacquire the lock in SharedRuntime::monitor_exit_helper().
 587     sd(tmp1_monitor, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));
 588 
 589     mv(flag, 1);
 590     j(slow_path);
 591   }
 592 
 593   bind(unlocked);
 594   mv(flag, zr);
 595 
 596 #ifdef ASSERT
 597   // Check that unlocked label is reached with flag == 0.
 598   Label flag_correct;
 599   beqz(flag, flag_correct);
  600   stop("Fast Unlock Flag != 0");
 601 #endif
 602 
 603   bind(slow_path);
 604 #ifdef ASSERT
 605   // Check that slow_path label is reached with flag != 0.
 606   bnez(flag, flag_correct);
  607   stop("Fast Unlock Flag == 0");
 608   bind(flag_correct);
 609 #endif
 610   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 611 }
 612 
 613 // short string
 614 // StringUTF16.indexOfChar
 615 // StringLatin1.indexOfChar
 616 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
 617                                                   Register ch, Register result,
 618                                                   bool isL)
 619 {
 620   Register ch1 = t0;
 621   Register index = t1;
 622 
 623   BLOCK_COMMENT("string_indexof_char_short {");
 624 
 625   Label LOOP, LOOP1, LOOP4, LOOP8;
 626   Label MATCH,  MATCH1, MATCH2, MATCH3,
 627         MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
 628 
 629   mv(result, -1);
 630   mv(index, zr);
 631 
 632   bind(LOOP);
 633   addi(t0, index, 8);
 634   ble(t0, cnt1, LOOP8);
 635   addi(t0, index, 4);
 636   ble(t0, cnt1, LOOP4);
 637   j(LOOP1);
 638 
 639   bind(LOOP8);
 640   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 641   beq(ch, ch1, MATCH);
 642   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 643   beq(ch, ch1, MATCH1);
 644   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 645   beq(ch, ch1, MATCH2);
 646   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 647   beq(ch, ch1, MATCH3);
 648   isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
 649   beq(ch, ch1, MATCH4);
 650   isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
 651   beq(ch, ch1, MATCH5);
 652   isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
 653   beq(ch, ch1, MATCH6);
 654   isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
 655   beq(ch, ch1, MATCH7);
 656   addi(index, index, 8);
 657   addi(str1, str1, isL ? 8 : 16);
 658   blt(index, cnt1, LOOP);
 659   j(NOMATCH);
 660 
 661   bind(LOOP4);
 662   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 663   beq(ch, ch1, MATCH);
 664   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 665   beq(ch, ch1, MATCH1);
 666   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 667   beq(ch, ch1, MATCH2);
 668   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 669   beq(ch, ch1, MATCH3);
 670   addi(index, index, 4);
 671   addi(str1, str1, isL ? 4 : 8);
 672   bge(index, cnt1, NOMATCH);
 673 
 674   bind(LOOP1);
 675   isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
 676   beq(ch, ch1, MATCH);
 677   addi(index, index, 1);
 678   addi(str1, str1, isL ? 1 : 2);
 679   blt(index, cnt1, LOOP1);
 680   j(NOMATCH);
 681 
 682   bind(MATCH1);
 683   addi(index, index, 1);
 684   j(MATCH);
 685 
 686   bind(MATCH2);
 687   addi(index, index, 2);
 688   j(MATCH);
 689 
 690   bind(MATCH3);
 691   addi(index, index, 3);
 692   j(MATCH);
 693 
 694   bind(MATCH4);
 695   addi(index, index, 4);
 696   j(MATCH);
 697 
 698   bind(MATCH5);
 699   addi(index, index, 5);
 700   j(MATCH);
 701 
 702   bind(MATCH6);
 703   addi(index, index, 6);
 704   j(MATCH);
 705 
 706   bind(MATCH7);
 707   addi(index, index, 7);
 708 
 709   bind(MATCH);
 710   mv(result, index);
 711   bind(NOMATCH);
 712   BLOCK_COMMENT("} string_indexof_char_short");
 713 }
 714 
 715 // StringUTF16.indexOfChar
 716 // StringLatin1.indexOfChar
 717 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 718                                             Register ch, Register result,
 719                                             Register tmp1, Register tmp2,
 720                                             Register tmp3, Register tmp4,
 721                                             bool isL)
 722 {
 723   Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
 724   Register ch1 = t0;
 725   Register orig_cnt = t1;
 726   Register mask1 = tmp3;
 727   Register mask2 = tmp2;
 728   Register match_mask = tmp1;
 729   Register trailing_char = tmp4;
 730   Register unaligned_elems = tmp4;
 731 
 732   BLOCK_COMMENT("string_indexof_char {");
 733   beqz(cnt1, NOMATCH);
 734 
 735   addi(t0, cnt1, isL ? -32 : -16);
 736   bgtz(t0, DO_LONG);
 737   string_indexof_char_short(str1, cnt1, ch, result, isL);
 738   j(DONE);
 739 
 740   bind(DO_LONG);
 741   mv(orig_cnt, cnt1);
 742   if (AvoidUnalignedAccesses) {
 743     Label ALIGNED;
 744     andi(unaligned_elems, str1, 0x7);
 745     beqz(unaligned_elems, ALIGNED);
 746     sub(unaligned_elems, unaligned_elems, 8);
 747     neg(unaligned_elems, unaligned_elems);
 748     if (!isL) {
 749       srli(unaligned_elems, unaligned_elems, 1);
 750     }
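          // unaligned_elems now holds the element count from str1 up to the next 8-byte boundary
          // (8 - (str1 & 7) bytes, halved for UTF-16).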
 751     // do unaligned part per element
 752     string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
 753     bgez(result, DONE);
 754     mv(orig_cnt, cnt1);
 755     sub(cnt1, cnt1, unaligned_elems);
 756     bind(ALIGNED);
 757   }
 758 
 759   // duplicate ch
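        // e.g. a Latin1 ch of 0x41 ('A') becomes 0x4141414141414141; a UTF-16 ch of 0x0041 becomes 0x0041004100410041.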
 760   if (isL) {
 761     slli(ch1, ch, 8);
 762     orr(ch, ch1, ch);
 763   }
 764   slli(ch1, ch, 16);
 765   orr(ch, ch1, ch);
 766   slli(ch1, ch, 32);
 767   orr(ch, ch1, ch);
 768 
 769   if (!isL) {
 770     slli(cnt1, cnt1, 1);
 771   }
 772 
 773   uint64_t mask0101 = UCONST64(0x0101010101010101);
 774   uint64_t mask0001 = UCONST64(0x0001000100010001);
 775   mv(mask1, isL ? mask0101 : mask0001);
 776   uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
 777   uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
 778   mv(mask2, isL ? mask7f7f : mask7fff);
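        // compute_match_mask xors ch1 with the replicated ch and applies the usual SWAR zero-lane test
        // (essentially (v - mask1) & ~(v | mask2)), leaving the high bit set in every byte (Latin1) or
        // 16-bit (UTF-16) lane that equals ch, so match_mask != 0 iff this word contains the character.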
 779 
 780   bind(CH1_LOOP);
 781   ld(ch1, Address(str1));
 782   addi(str1, str1, 8);
 783   addi(cnt1, cnt1, -8);
 784   compute_match_mask(ch1, ch, match_mask, mask1, mask2);
 785   bnez(match_mask, HIT);
 786   bgtz(cnt1, CH1_LOOP);
 787   j(NOMATCH);
 788 
 789   bind(HIT);
 790   ctzc_bit(trailing_char, match_mask, isL, ch1, result);
 791   srli(trailing_char, trailing_char, 3);
 792   addi(cnt1, cnt1, 8);
 793   ble(cnt1, trailing_char, NOMATCH);
 794   // match case
 795   if (!isL) {
 796     srli(cnt1, cnt1, 1);
 797     srli(trailing_char, trailing_char, 1);
 798   }
 799 
 800   sub(result, orig_cnt, cnt1);
 801   add(result, result, trailing_char);
 802   j(DONE);
 803 
 804   bind(NOMATCH);
 805   mv(result, -1);
 806 
 807   bind(DONE);
 808   BLOCK_COMMENT("} string_indexof_char");
 809 }
 810 
 811 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
 812 
 813 // Search for needle in haystack and return index or -1
 814 // x10: result
 815 // x11: haystack
 816 // x12: haystack_len
 817 // x13: needle
 818 // x14: needle_len
 819 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
 820                                        Register haystack_len, Register needle_len,
 821                                        Register tmp1, Register tmp2,
 822                                        Register tmp3, Register tmp4,
 823                                        Register tmp5, Register tmp6,
 824                                        Register result, int ae)
 825 {
 826   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 827 
 828   Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
 829 
 830   Register ch1 = t0;
 831   Register ch2 = t1;
 832   Register nlen_tmp = tmp1; // needle len tmp
 833   Register hlen_tmp = tmp2; // haystack len tmp
 834   Register result_tmp = tmp4;
 835 
 836   bool isLL = ae == StrIntrinsicNode::LL;
 837 
 838   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 839   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 840   int needle_chr_shift = needle_isL ? 0 : 1;
 841   int haystack_chr_shift = haystack_isL ? 0 : 1;
 842   int needle_chr_size = needle_isL ? 1 : 2;
 843   int haystack_chr_size = haystack_isL ? 1 : 2;
 844   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 845                               (load_chr_insn)&MacroAssembler::lhu;
 846   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 847                                 (load_chr_insn)&MacroAssembler::lhu;
 848 
 849   BLOCK_COMMENT("string_indexof {");
 850 
 851   // Note, inline_string_indexOf() generates checks:
 852   // if (pattern.count > src.count) return -1;
 853   // if (pattern.count == 0) return 0;
 854 
 855   // We have two strings, a source string in haystack, haystack_len and a pattern string
 856   // in needle, needle_len. Find the first occurrence of pattern in source or return -1.
 857 
 858   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 859   // With a small pattern and source we use linear scan.
 860 
  861   // needle_len >= 8 && needle_len < 256 && needle_len < haystack_len/4: use the BMH algorithm.
 862   sub(result_tmp, haystack_len, needle_len);
 863   // needle_len < 8, use linear scan
 864   sub(t0, needle_len, 8);
 865   bltz(t0, LINEARSEARCH);
 866   // needle_len >= 256, use linear scan
 867   sub(t0, needle_len, 256);
 868   bgez(t0, LINEARSTUB);
 869   // needle_len >= haystack_len/4, use linear scan
 870   srli(t0, haystack_len, 2);
 871   bge(needle_len, t0, LINEARSTUB);
 872 
 873   // Boyer-Moore-Horspool introduction:
  874   // The Boyer-Moore algorithm is based on the description here:
 875   //
 876   // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 877   //
  878   // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
 879   // and the 'Good Suffix' rule.
 880   //
 881   // These rules are essentially heuristics for how far we can shift the
 882   // pattern along the search string.
 883   //
 884   // The implementation here uses the 'Bad Character' rule only because of the
 885   // complexity of initialisation for the 'Good Suffix' rule.
 886   //
 887   // This is also known as the Boyer-Moore-Horspool algorithm:
 888   //
 889   // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 890   //
 891   // #define ASIZE 256
 892   //
 893   //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
 894   //      int i, j;
 895   //      unsigned c;
 896   //      unsigned char bc[ASIZE];
 897   //
 898   //      /* Preprocessing */
 899   //      for (i = 0; i < ASIZE; ++i)
 900   //        bc[i] = m;
 901   //      for (i = 0; i < m - 1; ) {
 902   //        c = pattern[i];
 903   //        ++i;
 904   //        // c < 256 for Latin1 string, so, no need for branch
 905   //        #ifdef PATTERN_STRING_IS_LATIN1
 906   //        bc[c] = m - i;
 907   //        #else
 908   //        if (c < ASIZE) bc[c] = m - i;
 909   //        #endif
 910   //      }
 911   //
 912   //      /* Searching */
 913   //      j = 0;
 914   //      while (j <= n - m) {
 915   //        c = src[i+j];
 916   //        if (pattern[m-1] == c)
 917   //          int k;
 918   //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 919   //          if (k < 0) return j;
 920   //          // c < 256 for Latin1 string, so, no need for branch
 921   //          #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
 922   //          // LL case: (c< 256) always true. Remove branch
 923   //          j += bc[pattern[j+m-1]];
 924   //          #endif
 925   //          #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
 926   //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 927   //          if (c < ASIZE)
 928   //            j += bc[pattern[j+m-1]];
 929   //          else
 930   //            j += 1
 931   //          #endif
 932   //          #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
 933   //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 934   //          if (c < ASIZE)
 935   //            j += bc[pattern[j+m-1]];
 936   //          else
 937   //            j += m
 938   //          #endif
 939   //      }
 940   //      return -1;
 941   //    }
 942 
 943   // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
 944   Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 945         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 946 
 947   Register haystack_end = haystack_len;
 948   Register skipch = tmp2;
 949 
  950   // pattern length is >= 8, so we can read at least 1 register for the cases when
  951   // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols), and half a register for
  952   // the UL case. We'll re-read the last character in the inner pre-loop code to have a
  953   // single outer pre-loop load.
 954   const int firstStep = isLL ? 7 : 3;
 955 
 956   const int ASIZE = 256;
 957   const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)
 958 
 959   sub(sp, sp, ASIZE);
 960 
 961   // init BC offset table with default value: needle_len
 962   slli(t0, needle_len, 8);
 963   orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
 964   slli(tmp1, t0, 16);
 965   orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
 966   slli(tmp1, t0, 32);
 967   orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
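        // e.g. needle_len == 9 yields tmp5 == 0x0909090909090909 (needle_len < 256 is guaranteed by the check above).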
 968 
 969   mv(ch1, sp);  // ch1 is t0
 970   mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
 971 
 972   bind(BM_INIT_LOOP);
 973   // for (i = 0; i < ASIZE; ++i)
 974   //   bc[i] = m;
 975   for (int i = 0; i < 4; i++) {
 976     sd(tmp5, Address(ch1, i * wordSize));
 977   }
 978   add(ch1, ch1, 32);
 979   sub(tmp6, tmp6, 4);
 980   bgtz(tmp6, BM_INIT_LOOP);
 981 
 982   sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
 983   Register orig_haystack = tmp5;
 984   mv(orig_haystack, haystack);
 985   // result_tmp = tmp4
 986   shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
 987   sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
 988   mv(tmp3, needle);
 989 
 990   //  for (i = 0; i < m - 1; ) {
 991   //    c = pattern[i];
 992   //    ++i;
 993   //    // c < 256 for Latin1 string, so, no need for branch
 994   //    #ifdef PATTERN_STRING_IS_LATIN1
 995   //    bc[c] = m - i;
 996   //    #else
 997   //    if (c < ASIZE) bc[c] = m - i;
 998   //    #endif
 999   //  }
1000   bind(BCLOOP);
1001   (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
1002   add(tmp3, tmp3, needle_chr_size);
1003   if (!needle_isL) {
1004     // ae == StrIntrinsicNode::UU
1005     mv(tmp6, ASIZE);
1006     bgeu(ch1, tmp6, BCSKIP);
1007   }
1008   add(tmp4, sp, ch1);
1009   sb(ch2, Address(tmp4)); // store skip offset to BC offset table
1010 
1011   bind(BCSKIP);
1012   sub(ch2, ch2, 1); // for next pattern element, skip distance -1
1013   bgtz(ch2, BCLOOP);
1014 
1015   // tmp6: pattern end, address after needle
1016   shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
1017   if (needle_isL == haystack_isL) {
1018     // load last 8 bytes (8LL/4UU symbols)
1019     ld(tmp6, Address(tmp6, -wordSize));
1020   } else {
1021     // UL: from UTF-16(source) search Latin1(pattern)
1022     lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
1023     // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
1024     // We'll have to wait until load completed, but it's still faster than per-character loads+checks
1025     srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
1026     slli(ch2, tmp6, XLEN - 24);
1027     srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
1028     slli(ch1, tmp6, XLEN - 16);
1029     srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
1030     andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
1031     slli(ch2, ch2, 16);
1032     orr(ch2, ch2, ch1); // 0x00000b0c
1033     slli(result, tmp3, 48); // use result as temp register
1034     orr(tmp6, tmp6, result); // 0x0a00000d
1035     slli(result, ch2, 16);
1036     orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
1037   }
1038 
1039   // i = m - 1;
1040   // skipch = j + i;
1041   // if (skipch == pattern[m - 1]
1042   //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
1043   // else
1044   //   move j with bad char offset table
1045   bind(BMLOOPSTR2);
1046   // compare pattern to source string backward
1047   shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
1048   (this->*haystack_load_1chr)(skipch, Address(result), noreg);
1049   sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
1050   if (needle_isL == haystack_isL) {
1051     // re-init tmp3. It's for free because it's executed in parallel with
1052     // load above. Alternative is to initialize it before loop, but it'll
1053     // affect performance on in-order systems with 2 or more ld/st pipelines
1054     srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
1055   }
1056   if (!isLL) { // UU/UL case
1057     slli(ch2, nlen_tmp, 1); // offsets in bytes
1058   }
1059   bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
1060   add(result, haystack, isLL ? nlen_tmp : ch2);
1061   // load 8 bytes from source string
1062   // if isLL is false then read granularity can be 2
1063   load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
1064   mv(ch1, tmp6);
1065   if (isLL) {
1066     j(BMLOOPSTR1_AFTER_LOAD);
1067   } else {
1068     sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
1069     j(BMLOOPSTR1_CMP);
1070   }
1071 
1072   bind(BMLOOPSTR1);
1073   shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
1074   (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1075   shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
1076   (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1077 
1078   bind(BMLOOPSTR1_AFTER_LOAD);
1079   sub(nlen_tmp, nlen_tmp, 1);
1080   bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
1081 
1082   bind(BMLOOPSTR1_CMP);
1083   beq(ch1, ch2, BMLOOPSTR1);
1084 
1085   bind(BMSKIP);
1086   if (!isLL) {
 1087     // if we've encountered a UTF-16 symbol while searching a Latin1 pattern, then we can
1088     // skip needle_len symbols
1089     if (needle_isL != haystack_isL) {
1090       mv(result_tmp, needle_len);
1091     } else {
1092       mv(result_tmp, 1);
1093     }
1094     mv(t0, ASIZE);
1095     bgeu(skipch, t0, BMADV);
1096   }
1097   add(result_tmp, sp, skipch);
1098   lbu(result_tmp, Address(result_tmp)); // load skip offset
1099 
1100   bind(BMADV);
1101   sub(nlen_tmp, needle_len, 1);
1102   // move haystack after bad char skip offset
1103   shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
1104   ble(haystack, haystack_end, BMLOOPSTR2);
1105   add(sp, sp, ASIZE);
1106   j(NOMATCH);
1107 
1108   bind(BMLOOPSTR1_LASTCMP);
1109   bne(ch1, ch2, BMSKIP);
1110 
1111   bind(BMMATCH);
1112   sub(result, haystack, orig_haystack);
1113   if (!haystack_isL) {
1114     srli(result, result, 1);
1115   }
1116   add(sp, sp, ASIZE);
1117   j(DONE);
1118 
1119   bind(LINEARSTUB);
1120   sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
1121   bltz(t0, LINEARSEARCH);
1122   mv(result, zr);
1123   RuntimeAddress stub = nullptr;
1124   if (isLL) {
1125     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
1126     assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
1127   } else if (needle_isL) {
1128     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
1129     assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
1130   } else {
1131     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
1132     assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
1133   }
1134   address call = reloc_call(stub);
1135   if (call == nullptr) {
1136     DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
1137     ciEnv::current()->record_failure("CodeCache is full");
1138     return;
1139   }
1140   j(DONE);
1141 
1142   bind(NOMATCH);
1143   mv(result, -1);
1144   j(DONE);
1145 
1146   bind(LINEARSEARCH);
1147   string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
1148 
1149   bind(DONE);
1150   BLOCK_COMMENT("} string_indexof");
1151 }
1152 
1153 // string_indexof
1154 // result: x10
1155 // src: x11
1156 // src_count: x12
1157 // pattern: x13
1158 // pattern_count: x14 or 1/2/3/4
1159 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
1160                                                Register haystack_len, Register needle_len,
1161                                                Register tmp1, Register tmp2,
1162                                                Register tmp3, Register tmp4,
1163                                                int needle_con_cnt, Register result, int ae)
1164 {
1165   // Note:
1166   // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
1167   // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
1168   assert(needle_con_cnt <= 4, "Invalid needle constant count");
1169   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1170 
1171   Register ch1 = t0;
1172   Register ch2 = t1;
1173   Register hlen_neg = haystack_len, nlen_neg = needle_len;
1174   Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
1175 
1176   bool isLL = ae == StrIntrinsicNode::LL;
1177 
1178   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
1179   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
1180   int needle_chr_shift = needle_isL ? 0 : 1;
1181   int haystack_chr_shift = haystack_isL ? 0 : 1;
1182   int needle_chr_size = needle_isL ? 1 : 2;
1183   int haystack_chr_size = haystack_isL ? 1 : 2;
1184 
1185   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
1186                               (load_chr_insn)&MacroAssembler::lhu;
1187   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
1188                                 (load_chr_insn)&MacroAssembler::lhu;
1189   load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
1190   load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
1191 
1192   Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
1193 
1194   Register first = tmp3;
1195 
1196   if (needle_con_cnt == -1) {
1197     Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
1198 
1199     sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
1200     bltz(t0, DOSHORT);
1201 
1202     (this->*needle_load_1chr)(first, Address(needle), noreg);
1203     slli(t0, needle_len, needle_chr_shift);
1204     add(needle, needle, t0);
1205     neg(nlen_neg, t0);
1206     slli(t0, result_tmp, haystack_chr_shift);
1207     add(haystack, haystack, t0);
1208     neg(hlen_neg, t0);
1209 
1210     bind(FIRST_LOOP);
1211     add(t0, haystack, hlen_neg);
1212     (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
1213     beq(first, ch2, STR1_LOOP);
1214 
1215     bind(STR2_NEXT);
1216     add(hlen_neg, hlen_neg, haystack_chr_size);
1217     blez(hlen_neg, FIRST_LOOP);
1218     j(NOMATCH);
1219 
1220     bind(STR1_LOOP);
1221     add(nlen_tmp, nlen_neg, needle_chr_size);
1222     add(hlen_tmp, hlen_neg, haystack_chr_size);
1223     bgez(nlen_tmp, MATCH);
1224 
1225     bind(STR1_NEXT);
1226     add(ch1, needle, nlen_tmp);
1227     (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1228     add(ch2, haystack, hlen_tmp);
1229     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1230     bne(ch1, ch2, STR2_NEXT);
1231     add(nlen_tmp, nlen_tmp, needle_chr_size);
1232     add(hlen_tmp, hlen_tmp, haystack_chr_size);
1233     bltz(nlen_tmp, STR1_NEXT);
1234     j(MATCH);
1235 
1236     bind(DOSHORT);
1237     if (needle_isL == haystack_isL) {
1238       sub(t0, needle_len, 2);
1239       bltz(t0, DO1);
1240       bgtz(t0, DO3);
1241     }
1242   }
1243 
1244   if (needle_con_cnt == 4) {
1245     Label CH1_LOOP;
1246     (this->*load_4chr)(ch1, Address(needle), noreg);
1247     sub(result_tmp, haystack_len, 4);
1248     slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
1249     add(haystack, haystack, tmp3);
1250     neg(hlen_neg, tmp3);
1251     if (AvoidUnalignedAccesses) {
 1252       // Preload the first value; then we read 1 character per iteration instead of four,
 1253       // shifting the previous ch2 right by the size of a character in bits.
1254       add(tmp3, haystack, hlen_neg);
1255       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1256       if (isLL) {
1257         // need to erase 1 most significant byte in 32-bit value of ch2
1258         slli(ch2, ch2, 40);
1259         srli(ch2, ch2, 32);
1260       } else {
1261         slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
1262       }
1263     }
1264 
1265     bind(CH1_LOOP);
1266     add(tmp3, haystack, hlen_neg);
1267     if (AvoidUnalignedAccesses) {
1268       srli(ch2, ch2, isLL ? 8 : 16);
1269       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
1270       slli(tmp3, tmp3, isLL ? 24 : 48);
1271       add(ch2, ch2, tmp3);
1272     } else {
1273       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1274     }
1275     beq(ch1, ch2, MATCH);
1276     add(hlen_neg, hlen_neg, haystack_chr_size);
1277     blez(hlen_neg, CH1_LOOP);
1278     j(NOMATCH);
1279   }
1280 
1281   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
1282     Label CH1_LOOP;
1283     BLOCK_COMMENT("string_indexof DO2 {");
1284     bind(DO2);
1285     (this->*load_2chr)(ch1, Address(needle), noreg);
1286     if (needle_con_cnt == 2) {
1287       sub(result_tmp, haystack_len, 2);
1288     }
1289     slli(tmp3, result_tmp, haystack_chr_shift);
1290     add(haystack, haystack, tmp3);
1291     neg(hlen_neg, tmp3);
1292     if (AvoidUnalignedAccesses) {
 1293       // Preload the first value; then we read 1 character per iteration instead of two,
 1294       // shifting the previous ch2 right by the size of a character in bits.
1295       add(tmp3, haystack, hlen_neg);
1296       (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1297       slli(ch2, ch2, isLL ? 8 : 16);
1298     }
1299     bind(CH1_LOOP);
1300     add(tmp3, haystack, hlen_neg);
1301     if (AvoidUnalignedAccesses) {
1302       srli(ch2, ch2, isLL ? 8 : 16);
1303       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
1304       slli(tmp3, tmp3, isLL ? 8 : 16);
1305       add(ch2, ch2, tmp3);
1306     } else {
1307       (this->*load_2chr)(ch2, Address(tmp3), noreg);
1308     }
1309     beq(ch1, ch2, MATCH);
1310     add(hlen_neg, hlen_neg, haystack_chr_size);
1311     blez(hlen_neg, CH1_LOOP);
1312     j(NOMATCH);
1313     BLOCK_COMMENT("} string_indexof DO2");
1314   }
1315 
1316   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
1317     Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1318     BLOCK_COMMENT("string_indexof DO3 {");
1319 
1320     bind(DO3);
1321     (this->*load_2chr)(first, Address(needle), noreg);
1322     (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
1323     if (needle_con_cnt == 3) {
1324       sub(result_tmp, haystack_len, 3);
1325     }
1326     slli(hlen_tmp, result_tmp, haystack_chr_shift);
1327     add(haystack, haystack, hlen_tmp);
1328     neg(hlen_neg, hlen_tmp);
1329 
1330     bind(FIRST_LOOP);
1331     add(ch2, haystack, hlen_neg);
1332     if (AvoidUnalignedAccesses) {
1333       (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
1334       (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1335       slli(tmp2, tmp2, isLL ? 8 : 16);
1336       add(ch2, ch2, tmp2);
1337     } else {
1338       (this->*load_2chr)(ch2, Address(ch2), noreg);
1339     }
1340     beq(first, ch2, STR1_LOOP);
1341 
1342     bind(STR2_NEXT);
1343     add(hlen_neg, hlen_neg, haystack_chr_size);
1344     blez(hlen_neg, FIRST_LOOP);
1345     j(NOMATCH);
1346 
1347     bind(STR1_LOOP);
1348     add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
1349     add(ch2, haystack, hlen_tmp);
1350     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1351     bne(ch1, ch2, STR2_NEXT);
1352     j(MATCH);
1353     BLOCK_COMMENT("} string_indexof DO3");
1354   }
1355 
1356   if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1357     Label DO1_LOOP;
1358 
1359     BLOCK_COMMENT("string_indexof DO1 {");
1360     bind(DO1);
1361     (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1362     sub(result_tmp, haystack_len, 1);
1363     slli(tmp3, result_tmp, haystack_chr_shift);
1364     add(haystack, haystack, tmp3);
1365     neg(hlen_neg, tmp3);
1366 
1367     bind(DO1_LOOP);
1368     add(tmp3, haystack, hlen_neg);
1369     (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1370     beq(ch1, ch2, MATCH);
1371     add(hlen_neg, hlen_neg, haystack_chr_size);
1372     blez(hlen_neg, DO1_LOOP);
1373     BLOCK_COMMENT("} string_indexof DO1");
1374   }
1375 
1376   bind(NOMATCH);
1377   mv(result, -1);
1378   j(DONE);
1379 
1380   bind(MATCH);
1381   srai(t0, hlen_neg, haystack_chr_shift);
1382   add(result, result_tmp, t0);
1383 
1384   bind(DONE);
1385 }
1386 
1387 // Compare strings.
1388 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1389                                        Register cnt1, Register cnt2, Register result,
1390                                        Register tmp1, Register tmp2, Register tmp3,
1391                                        int ae)
1392 {
1393   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1394         DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1395         SHORT_LOOP_START, TAIL_CHECK, L;
1396 
1397   const int STUB_THRESHOLD = 64 + 8;
1398   bool isLL = ae == StrIntrinsicNode::LL;
1399   bool isLU = ae == StrIntrinsicNode::LU;
1400   bool isUL = ae == StrIntrinsicNode::UL;
1401 
1402   bool str1_isL = isLL || isLU;
1403   bool str2_isL = isLL || isUL;
1404 
1405   // for L strings, 1 byte for 1 character
1406   // for U strings, 2 bytes for 1 character
1407   int str1_chr_size = str1_isL ? 1 : 2;
1408   int str2_chr_size = str2_isL ? 1 : 2;
1409   int minCharsInWord = isLL ? wordSize : wordSize / 2;
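        // i.e. the number of characters handled per 8-byte word in the main loop: 8 when both strings are Latin1, otherwise 4.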
1410 
1411   load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1412   load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1413 
1414   BLOCK_COMMENT("string_compare {");
1415 
1416   // Bizarrely, the counts are passed in bytes, regardless of whether they
 1417   // are L or U strings; the result, however, is always in characters.
1418   if (!str1_isL) {
1419     sraiw(cnt1, cnt1, 1);
1420   }
1421   if (!str2_isL) {
1422     sraiw(cnt2, cnt2, 1);
1423   }
1424 
1425   // Compute the minimum of the string lengths and save the difference in result.
1426   sub(result, cnt1, cnt2);
1427   bgt(cnt1, cnt2, L);
1428   mv(cnt2, cnt1);
1429   bind(L);
1430 
1431   // A very short string
1432   mv(t0, minCharsInWord);
1433   ble(cnt2, t0, SHORT_STRING);
1434 
1435   // Compare longwords
1436   // load first parts of strings and finish initialization while loading
1437   {
1438     if (str1_isL == str2_isL) { // LL or UU
 1439       // check if str1 and str2 are the same pointer
1440       beq(str1, str2, DONE);
1441       // load 8 bytes once to compare
1442       ld(tmp1, Address(str1));
1443       ld(tmp2, Address(str2));
1444       mv(t0, STUB_THRESHOLD);
1445       bge(cnt2, t0, STUB);
1446       sub(cnt2, cnt2, minCharsInWord);
1447       beqz(cnt2, TAIL_CHECK);
1448       // convert cnt2 from characters to bytes
1449       if (!str1_isL) {
1450         slli(cnt2, cnt2, 1);
1451       }
1452       add(str2, str2, cnt2);
1453       add(str1, str1, cnt2);
1454       sub(cnt2, zr, cnt2);
1455     } else if (isLU) { // LU case
1456       lwu(tmp1, Address(str1));
1457       ld(tmp2, Address(str2));
1458       mv(t0, STUB_THRESHOLD);
1459       bge(cnt2, t0, STUB);
1460       addi(cnt2, cnt2, -4);
1461       add(str1, str1, cnt2);
1462       sub(cnt1, zr, cnt2);
1463       slli(cnt2, cnt2, 1);
1464       add(str2, str2, cnt2);
1465       inflate_lo32(tmp3, tmp1);
1466       mv(tmp1, tmp3);
1467       sub(cnt2, zr, cnt2);
1468       addi(cnt1, cnt1, 4);
1469     } else { // UL case
1470       ld(tmp1, Address(str1));
1471       lwu(tmp2, Address(str2));
1472       mv(t0, STUB_THRESHOLD);
1473       bge(cnt2, t0, STUB);
1474       addi(cnt2, cnt2, -4);
1475       slli(t0, cnt2, 1);
1476       sub(cnt1, zr, t0);
1477       add(str1, str1, t0);
1478       add(str2, str2, cnt2);
1479       inflate_lo32(tmp3, tmp2);
1480       mv(tmp2, tmp3);
1481       sub(cnt2, zr, cnt2);
1482       addi(cnt1, cnt1, 8);
1483     }
1484     addi(cnt2, cnt2, isUL ? 4 : 8);
1485     bne(tmp1, tmp2, DIFFERENCE);
1486     bgez(cnt2, TAIL);
1487 
1488     // main loop
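         // cnt2 (and cnt1 in the mixed-encoding cases) are negative byte offsets relative to
         // the pre-advanced string ends; they step towards zero each iteration and the loop
         // exits once cnt2 becomes non-negative.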
1489     bind(NEXT_WORD);
1490     if (str1_isL == str2_isL) { // LL or UU
1491       add(t0, str1, cnt2);
1492       ld(tmp1, Address(t0));
1493       add(t0, str2, cnt2);
1494       ld(tmp2, Address(t0));
1495       addi(cnt2, cnt2, 8);
1496     } else if (isLU) { // LU case
1497       add(t0, str1, cnt1);
1498       lwu(tmp1, Address(t0));
1499       add(t0, str2, cnt2);
1500       ld(tmp2, Address(t0));
1501       addi(cnt1, cnt1, 4);
1502       inflate_lo32(tmp3, tmp1);
1503       mv(tmp1, tmp3);
1504       addi(cnt2, cnt2, 8);
1505     } else { // UL case
1506       add(t0, str2, cnt2);
1507       lwu(tmp2, Address(t0));
1508       add(t0, str1, cnt1);
1509       ld(tmp1, Address(t0));
1510       inflate_lo32(tmp3, tmp2);
1511       mv(tmp2, tmp3);
1512       addi(cnt1, cnt1, 8);
1513       addi(cnt2, cnt2, 4);
1514     }
1515     bne(tmp1, tmp2, DIFFERENCE);
1516     bltz(cnt2, NEXT_WORD);
1517     bind(TAIL);
1518     if (str1_isL == str2_isL) { // LL or UU
1519       load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1520       load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1521     } else if (isLU) { // LU case
1522       load_int_misaligned(tmp1, Address(str1), tmp3, false);
1523       load_long_misaligned(tmp2, Address(str2), tmp3, 2);
1524       inflate_lo32(tmp3, tmp1);
1525       mv(tmp1, tmp3);
1526     } else { // UL case
1527       load_int_misaligned(tmp2, Address(str2), tmp3, false);
1528       load_long_misaligned(tmp1, Address(str1), tmp3, 2);
1529       inflate_lo32(tmp3, tmp2);
1530       mv(tmp2, tmp3);
1531     }
1532     bind(TAIL_CHECK);
1533     beq(tmp1, tmp2, DONE);
1534 
1535     // Find the first different characters in the longwords and
1536     // compute their difference.
1537     bind(DIFFERENCE);
1538     xorr(tmp3, tmp1, tmp2);
1539     ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
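         // result is the bit offset of the first differing character; shifting both words
         // right by it brings that character into the low byte/halfword, where it is masked
         // out and subtracted.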
1540     srl(tmp1, tmp1, result);
1541     srl(tmp2, tmp2, result);
1542     if (isLL) {
1543       andi(tmp1, tmp1, 0xFF);
1544       andi(tmp2, tmp2, 0xFF);
1545     } else {
1546       andi(tmp1, tmp1, 0xFFFF);
1547       andi(tmp2, tmp2, 0xFFFF);
1548     }
1549     sub(result, tmp1, tmp2);
1550     j(DONE);
1551   }
1552 
1553   bind(STUB);
1554   RuntimeAddress stub = nullptr;
1555   switch (ae) {
1556     case StrIntrinsicNode::LL:
1557       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1558       break;
1559     case StrIntrinsicNode::UU:
1560       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1561       break;
1562     case StrIntrinsicNode::LU:
1563       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1564       break;
1565     case StrIntrinsicNode::UL:
1566       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1567       break;
1568     default:
1569       ShouldNotReachHere();
1570   }
1571   assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1572   address call = reloc_call(stub);
1573   if (call == nullptr) {
1574     DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1575     ciEnv::current()->record_failure("CodeCache is full");
1576     return;
1577   }
1578   j(DONE);
1579 
1580   bind(SHORT_STRING);
1581   // Is the minimum length zero?
1582   beqz(cnt2, DONE);
1583   // arrange code to do most branches while loading and to load the next characters
1584   // while comparing the previous ones
1585   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1586   addi(str1, str1, str1_chr_size);
1587   addi(cnt2, cnt2, -1);
1588   beqz(cnt2, SHORT_LAST_INIT);
1589   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1590   addi(str2, str2, str2_chr_size);
1591   j(SHORT_LOOP_START);
1592   bind(SHORT_LOOP);
1593   addi(cnt2, cnt2, -1);
1594   beqz(cnt2, SHORT_LAST);
1595   bind(SHORT_LOOP_START);
1596   (this->*str1_load_chr)(tmp2, Address(str1), t0);
1597   addi(str1, str1, str1_chr_size);
1598   (this->*str2_load_chr)(t0, Address(str2), t0);
1599   addi(str2, str2, str2_chr_size);
1600   bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1601   addi(cnt2, cnt2, -1);
1602   beqz(cnt2, SHORT_LAST2);
1603   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1604   addi(str1, str1, str1_chr_size);
1605   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1606   addi(str2, str2, str2_chr_size);
1607   beq(tmp2, t0, SHORT_LOOP);
1608   sub(result, tmp2, t0);
1609   j(DONE);
1610   bind(SHORT_LOOP_TAIL);
1611   sub(result, tmp1, cnt1);
1612   j(DONE);
1613   bind(SHORT_LAST2);
1614   beq(tmp2, t0, DONE);
1615   sub(result, tmp2, t0);
1616 
1617   j(DONE);
1618   bind(SHORT_LAST_INIT);
1619   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1620   addi(str2, str2, str2_chr_size);
1621   bind(SHORT_LAST);
1622   beq(tmp1, cnt1, DONE);
1623   sub(result, tmp1, cnt1);
1624 
1625   bind(DONE);
1626 
1627   BLOCK_COMMENT("} string_compare");
1628 }
1629 
1630 void C2_MacroAssembler::arrays_equals(Register a1, Register a2,
1631                                       Register tmp1, Register tmp2, Register tmp3,
1632                                       Register result, int elem_size) {
1633   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1634   assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0);
1635 
1636   int elem_per_word = wordSize/elem_size;
1637   int log_elem_size = exact_log2(elem_size);
1638   int length_offset = arrayOopDesc::length_offset_in_bytes();
1639   int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1640 
1641   Register cnt1 = tmp3;
1642   Register cnt2 = tmp1;  // cnt2 only used in array length compare
1643   Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01;
1644 
1645   BLOCK_COMMENT("arrays_equals {");
1646 
1647   // if (a1 == a2), return true
1648   beq(a1, a2, SAME);
1649 
1650   mv(result, false);
1651   // if (a1 == nullptr || a2 == nullptr)
1652   //     return false;
1653   beqz(a1, DONE);
1654   beqz(a2, DONE);
1655 
1656   // if (a1.length != a2.length)
1657   //      return false;
1658   lwu(cnt1, Address(a1, length_offset));
1659   lwu(cnt2, Address(a2, length_offset));
1660   bne(cnt1, cnt2, DONE);
1661 
1662   la(a1, Address(a1, base_offset));
1663   la(a2, Address(a2, base_offset));
1664   // Check for short strings, i.e. smaller than wordSize.
1665   addi(cnt1, cnt1, -elem_per_word);
1666   bltz(cnt1, SHORT);
1667 
1668   // Main 8 byte comparison loop.
1669   bind(NEXT_WORD); {
1670     ld(tmp1, Address(a1));
1671     ld(tmp2, Address(a2));
1672     addi(cnt1, cnt1, -elem_per_word);
1673     addi(a1, a1, wordSize);
1674     addi(a2, a2, wordSize);
1675     bne(tmp1, tmp2, DONE);
1676   } bgez(cnt1, NEXT_WORD);
1677 
1678   addi(tmp1, cnt1, elem_per_word);
1679   beqz(tmp1, SAME);
1680 
1681   bind(SHORT);
1682   test_bit(tmp1, cnt1, 2 - log_elem_size);
1683   beqz(tmp1, TAIL03); // 0-7 bytes left.
1684   {
1685     lwu(tmp1, Address(a1));
1686     lwu(tmp2, Address(a2));
1687     addi(a1, a1, 4);
1688     addi(a2, a2, 4);
1689     bne(tmp1, tmp2, DONE);
1690   }
1691 
1692   bind(TAIL03);
1693   test_bit(tmp1, cnt1, 1 - log_elem_size);
1694   beqz(tmp1, TAIL01); // 0-3 bytes left.
1695   {
1696     lhu(tmp1, Address(a1));
1697     lhu(tmp2, Address(a2));
1698     addi(a1, a1, 2);
1699     addi(a2, a2, 2);
1700     bne(tmp1, tmp2, DONE);
1701   }
1702 
1703   bind(TAIL01);
1704   if (elem_size == 1) { // Only needed when comparing byte arrays.
1705     test_bit(tmp1, cnt1, 0);
1706     beqz(tmp1, SAME); // 0-1 bytes left.
1707     {
1708       lbu(tmp1, Address(a1));
1709       lbu(tmp2, Address(a2));
1710       bne(tmp1, tmp2, DONE);
1711     }
1712   }
1713 
1714   bind(SAME);
1715   mv(result, true);
1716   // That's it.
1717   bind(DONE);
1718 
1719   BLOCK_COMMENT("} arrays_equals");
1720 }
1721 
1722 // Compare Strings
1723 
1724 // For Strings we're passed the address of the first characters in a1 and a2
1725 // and the length in cnt1. There are two implementations.
1726 // For arrays >= 8 bytes, all comparisons (except for the tail) are performed
1727 // 8 bytes at a time. For the tail, we compare 4 bytes, then 2 bytes, and then a single byte.
1728 // For strings < 8 bytes, we likewise compare 4 bytes, then 2 bytes, and then a single byte.
1729 
1730 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1731                                       Register result, Register cnt1)
1732 {
1733   Label SAME, DONE, SHORT, NEXT_WORD;
1734   Register tmp1 = t0;
1735   Register tmp2 = t1;
1736 
1737   assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1738 
1739   BLOCK_COMMENT("string_equals {");
1740 
1741   mv(result, false);
1742 
1743   // Check for short strings, i.e. smaller than wordSize.
1744   addi(cnt1, cnt1, -wordSize);
1745   bltz(cnt1, SHORT);
1746 
1747   // Main 8 byte comparison loop.
1748   bind(NEXT_WORD); {
1749     ld(tmp1, Address(a1));
1750     ld(tmp2, Address(a2));
1751     addi(cnt1, cnt1, -wordSize);
1752     addi(a1, a1, wordSize);
1753     addi(a2, a2, wordSize);
1754     bne(tmp1, tmp2, DONE);
1755   } bgez(cnt1, NEXT_WORD);
1756 
1757   addi(tmp1, cnt1, wordSize);
1758   beqz(tmp1, SAME);
1759 
1760   bind(SHORT);
1761   Label TAIL03, TAIL01;
1762 
1763   // 0-7 bytes left.
1764   test_bit(tmp1, cnt1, 2);
1765   beqz(tmp1, TAIL03);
1766   {
1767     lwu(tmp1, Address(a1));
1768     lwu(tmp2, Address(a2));
1769     addi(a1, a1, 4);
1770     addi(a2, a2, 4);
1771     bne(tmp1, tmp2, DONE);
1772   }
1773 
1774   bind(TAIL03);
1775   // 0-3 bytes left.
1776   test_bit(tmp1, cnt1, 1);
1777   beqz(tmp1, TAIL01);
1778   {
1779     lhu(tmp1, Address(a1));
1780     lhu(tmp2, Address(a2));
1781     addi(a1, a1, 2);
1782     addi(a2, a2, 2);
1783     bne(tmp1, tmp2, DONE);
1784   }
1785 
1786   bind(TAIL01);
1787   // 0-1 bytes left.
1788   test_bit(tmp1, cnt1, 0);
1789   beqz(tmp1, SAME);
1790   {
1791     lbu(tmp1, Address(a1));
1792     lbu(tmp2, Address(a2));
1793     bne(tmp1, tmp2, DONE);
1794   }
1795 
1796   // Arrays are equal.
1797   bind(SAME);
1798   mv(result, true);
1799 
1800   // That's it.
1801   bind(DONE);
1802   BLOCK_COMMENT("} string_equals");
1803 }
1804 
1805 // jdk.internal.util.ArraysSupport.vectorizedHashCode
1806 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
1807                                         Register tmp1, Register tmp2, Register tmp3,
1808                                         Register tmp4, Register tmp5, Register tmp6,
1809                                         BasicType eltype)
1810 {
1811   assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);
1812 
1813   const int elsize = arrays_hashcode_elsize(eltype);
1814   const int chunks_end_shift = exact_log2(elsize);
1815 
1816   switch (eltype) {
1817   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
1818   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
1819   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
1820   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
1821   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
1822   default:
1823     ShouldNotReachHere();
1824   }
1825 
1826   const int stride = 4;
1827   const Register pow31_4 = tmp1;
1828   const Register pow31_3 = tmp2;
1829   const Register pow31_2 = tmp3;
1830   const Register chunks  = tmp4;
1831   const Register chunks_end = chunks;
1832 
1833   Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;
1834 
1835   // result already contains the caller-supplied initial hash value
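       //
       // The wide loop below computes the standard polynomial hash h = 31*h + a[i]
       // unrolled four elements at a time:
       //   h = 31^4*h + 31^3*a[i] + 31^2*a[i+1] + 31*a[i+2] + a[i+3]
       // with the required powers of 31 kept in pow31_4, pow31_3 and pow31_2.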
1836 
1837   beqz(cnt, DONE);
1838 
1839   andi(chunks, cnt, ~(stride-1));
1840   beqz(chunks, TAIL);
1841 
1842   mv(pow31_4, 923521);           // [31^^4]
1843   mv(pow31_3,  29791);           // [31^^3]
1844   mv(pow31_2,    961);           // [31^^2]
1845 
1846   slli(chunks_end, chunks, chunks_end_shift);
1847   add(chunks_end, ary, chunks_end);
1848   andi(cnt, cnt, stride-1);      // don't forget about tail!
1849 
1850   bind(WIDE_LOOP);
1851   mulw(result, result, pow31_4); // 31^^4 * h
1852   arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
1853   arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
1854   arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
1855   arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
1856   mulw(t0, t0, pow31_3);         // 31^^3 * ary[i+0]
1857   addw(result, result, t0);
1858   mulw(t1, t1, pow31_2);         // 31^^2 * ary[i+1]
1859   addw(result, result, t1);
1860   slli(t0, tmp5, 5);             // optimize 31^^1 * ary[i+2]
1861   subw(tmp5, t0, tmp5);          // with ary[i+2]<<5 - ary[i+2]
1862   addw(result, result, tmp5);
1863   addw(result, result, tmp6);    // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
1864                                  //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
1865   addi(ary, ary, elsize * stride);
1866   bne(ary, chunks_end, WIDE_LOOP);
1867   beqz(cnt, DONE);
1868 
1869   bind(TAIL);
1870   slli(chunks_end, cnt, chunks_end_shift);
1871   add(chunks_end, ary, chunks_end);
1872 
1873   bind(TAIL_LOOP);
1874   arrays_hashcode_elload(t0, Address(ary), eltype);
1875   slli(t1, result, 5);           // optimize 31 * result
1876   subw(result, t1, result);      // with result<<5 - result
1877   addw(result, result, t0);
1878   addi(ary, ary, elsize);
1879   bne(ary, chunks_end, TAIL_LOOP);
1880 
1881   bind(DONE);
1882   BLOCK_COMMENT("} // arrays_hashcode");
1883 }
1884 
1885 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
1886   switch (eltype) {
1887   case T_BOOLEAN: return sizeof(jboolean);
1888   case T_BYTE:    return sizeof(jbyte);
1889   case T_SHORT:   return sizeof(jshort);
1890   case T_CHAR:    return sizeof(jchar);
1891   case T_INT:     return sizeof(jint);
1892   default:
1893     ShouldNotReachHere();
1894     return -1;
1895   }
1896 }
1897 
1898 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
1899   switch (eltype) {
1900   // T_BOOLEAN used as surrogate for unsigned byte
1901   case T_BOOLEAN: lbu(dst, src);   break;
1902   case T_BYTE:     lb(dst, src);   break;
1903   case T_SHORT:    lh(dst, src);   break;
1904   case T_CHAR:    lhu(dst, src);   break;
1905   case T_INT:      lw(dst, src);   break;
1906   default:
1907     ShouldNotReachHere();
1908   }
1909 }
1910 
1911 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1912 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1913                                                               bool is_far, bool is_unordered);
1914 
1915 static conditional_branch_insn conditional_branches[] =
1916 {
1917   /* SHORT branches */
1918   (conditional_branch_insn)&MacroAssembler::beq,
1919   (conditional_branch_insn)&MacroAssembler::bgt,
1920   nullptr, // BoolTest::overflow
1921   (conditional_branch_insn)&MacroAssembler::blt,
1922   (conditional_branch_insn)&MacroAssembler::bne,
1923   (conditional_branch_insn)&MacroAssembler::ble,
1924   nullptr, // BoolTest::no_overflow
1925   (conditional_branch_insn)&MacroAssembler::bge,
1926 
1927   /* UNSIGNED branches */
1928   (conditional_branch_insn)&MacroAssembler::beq,
1929   (conditional_branch_insn)&MacroAssembler::bgtu,
1930   nullptr,
1931   (conditional_branch_insn)&MacroAssembler::bltu,
1932   (conditional_branch_insn)&MacroAssembler::bne,
1933   (conditional_branch_insn)&MacroAssembler::bleu,
1934   nullptr,
1935   (conditional_branch_insn)&MacroAssembler::bgeu
1936 };
1937 
1938 static float_conditional_branch_insn float_conditional_branches[] =
1939 {
1940   /* FLOAT SHORT branches */
1941   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1942   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1943   nullptr,  // BoolTest::overflow
1944   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1945   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1946   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1947   nullptr, // BoolTest::no_overflow
1948   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1949 
1950   /* DOUBLE SHORT branches */
1951   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1952   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1953   nullptr,
1954   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1955   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1956   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1957   nullptr,
1958   (float_conditional_branch_insn)&MacroAssembler::double_bge
1959 };
1960 
1961 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1962   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1963          "invalid conditional branch index");
1964   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1965 }
1966 
1967 // This function should only be used by C2. Flip the unordered bit for unordered-greater comparisons,
1968 // since C2 uses unordered-lesser instead of unordered-greater. Finally, the result bits are commuted in do_one_bytecode().
1969 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1970   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1971          "invalid float conditional branch index");
1972   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1973   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1974     (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1975 }
1976 
1977 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1978   switch (cmpFlag) {
1979     case BoolTest::eq:
1980     case BoolTest::le:
1981       beqz(op1, L, is_far);
1982       break;
1983     case BoolTest::ne:
1984     case BoolTest::gt:
1985       bnez(op1, L, is_far);
1986       break;
1987     default:
1988       ShouldNotReachHere();
1989   }
1990 }
1991 
1992 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1993   switch (cmpFlag) {
1994     case BoolTest::eq:
1995       beqz(op1, L, is_far);
1996       break;
1997     case BoolTest::ne:
1998       bnez(op1, L, is_far);
1999       break;
2000     default:
2001       ShouldNotReachHere();
2002   }
2003 }
2004 
2005 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
2006   Label L;
2007   cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
2008   mv(dst, src);
2009   bind(L);
2010 }
2011 
2012 // Set dst to NaN if any NaN input.
2013 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
2014                                   bool is_double, bool is_min) {
2015   assert_different_registers(dst, src1, src2);
2016 
2017   Label Done, Compare;
2018 
2019   is_double ? fclass_d(t0, src1)
2020             : fclass_s(t0, src1);
2021   is_double ? fclass_d(t1, src2)
2022             : fclass_s(t1, src2);
2023   orr(t0, t0, t1);
2024   andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2025   beqz(t0, Compare);
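       // at least one input is NaN: adding the two inputs produces a quiet NaN, which is returned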
2026   is_double ? fadd_d(dst, src1, src2)
2027             : fadd_s(dst, src1, src2);
2028   j(Done);
2029 
2030   bind(Compare);
2031   if (is_double) {
2032     is_min ? fmin_d(dst, src1, src2)
2033            : fmax_d(dst, src1, src2);
2034   } else {
2035     is_min ? fmin_s(dst, src1, src2)
2036            : fmax_s(dst, src1, src2);
2037   }
2038 
2039   bind(Done);
2040 }
2041 
2042 // According to Java SE specification, for floating-point round operations, if
2043 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the
2044 // rounded result; this differs from the behavior of RISC-V fcvt instructions (which
2045 // round out-of-range values to the nearest max or min value), therefore special
2046 // handling is needed for NaN, +/-Infinity and +/-0.
2047 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
2048                                           Register tmp1, Register tmp2, Register tmp3) {
2049 
2050   assert_different_registers(dst, src);
2051   assert_different_registers(tmp1, tmp2, tmp3);
2052 
2053   // Set rounding mode for conversions
2054   // Here we use the same mode for both the double->long and long->double conversions.
2055   // A different mode for the long->double conversion would matter only if the long value were not representable
2056   // as a double; since the long value is the result of a double->long conversion, it is definitely representable.
2057   RoundingMode rm;
2058   switch (round_mode) {
2059     case RoundDoubleModeNode::rmode_ceil:
2060       rm = RoundingMode::rup;
2061       break;
2062     case RoundDoubleModeNode::rmode_floor:
2063       rm = RoundingMode::rdn;
2064       break;
2065     case RoundDoubleModeNode::rmode_rint:
2066       rm = RoundingMode::rne;
2067       break;
2068     default:
2069       ShouldNotReachHere();
2070   }
2071 
2072   // tmp1 - register holding the double converted to long
2073   // tmp2 - register holding the constant used for comparison
2074   // tmp3 - register holding the modified result of the double->long conversion
2075   Label done, bad_val;
2076 
2077   // Conversion from double to long
2078   fcvt_l_d(tmp1, src, rm);
2079 
2080   // Generate constant (tmp2)
2081   // tmp2 = 100...0000
2082   addi(tmp2, zr, 1);
2083   slli(tmp2, tmp2, 63);
2084 
2085   // Prepare converted long (tmp1)
2086   // when the conversion overflows we get:
2087   // tmp1 = 011...1111 or 100...0000
2088   // Convert it to: tmp3 = 100...0000
2089   addi(tmp3, tmp1, 1);
2090   andi(tmp3, tmp3, -2);
2091   beq(tmp3, tmp2, bad_val);
2092 
2093   // Conversion from long to double
2094   fcvt_d_l(dst, tmp1, rm);
2095   // Add sign of input value to result for +/- 0 cases
2096   fsgnj_d(dst, dst, src);
2097   j(done);
2098 
2099   // If the conversion overflowed, return src
2100   bind(bad_val);
2101   fmv_d(dst, src);
2102 
2103   bind(done);
2104 }
2105 
2106 // According to Java SE specification, for floating-point signum operations, if
2107 // the input is NaN or +/-0.0 we should return it unchanged,
2108 // otherwise return +/-1.0 using the sign of the input.
2109 // one - provides a floating-point 1.0 (obtained from the matching rule)
2110 // bool is_double - specifies whether single or double precision operations will be used.
2111 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
2112   Label done;
2113 
2114   is_double ? fclass_d(t0, dst)
2115             : fclass_s(t0, dst);
2116 
2117   // check if input is -0, +0, signaling NaN or quiet NaN
2118   andi(t0, t0, fclass_mask::zero | fclass_mask::nan);
2119 
2120   bnez(t0, done);
2121 
2122   // use floating-point 1.0 with a sign of input
2123   is_double ? fsgnj_d(dst, one, dst)
2124             : fsgnj_s(dst, one, dst);
2125 
2126   bind(done);
2127 }
2128 
2129 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
2130 #define __ masm.
2131   FloatRegister dst = stub.data<0>();
2132   Register src = stub.data<1>();
2133   Register tmp = stub.data<2>();
2134   __ bind(stub.entry());
2135 
2136   // The following instructions mainly focus on NaN, as riscv does not handle
2137   // NaN well with fcvt; the code also works for Inf at the same time.
2138 
2139   // construct a NaN in 32 bits from the NaN in 16 bits;
2140   // we need the payloads of non-canonical NaNs to be preserved.
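       // A half-precision value keeps its 10-bit mantissa in bits [9:0]; shifting it left by 13
       // places it at the top of the 23-bit single-precision mantissa, and or-ing in 0x7f800000
       // sets the all-ones exponent, so the NaN payload is carried over.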
2141   __ mv(tmp, 0x7f800000);
2142   // sign-bit was already set via sign-extension if necessary.
2143   __ slli(t0, src, 13);
2144   __ orr(tmp, t0, tmp);
2145   __ fmv_w_x(dst, tmp);
2146 
2147   __ j(stub.continuation());
2148 #undef __
2149 }
2150 
2151 // j.l.Float.float16ToFloat
2152 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
2153   auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
2154 
2155   // On riscv, NaN needs special handling as fcvt does not work correctly in that case.
2156   // Inf does not need special handling as fcvt can handle it correctly,
2157   // but we let the slow path process NaN and Inf at the same time,
2158   // as both of them are rare cases; making the slow path handle
2159   // only the NaN case would sacrifice the performance for the normal cases,
2160   // i.e. the non-NaN and non-Inf cases.
2161 
2162   // check whether it's a NaN or +/- Inf.
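       // 0x7c00 is the half-precision exponent mask; the exponent field is all ones
       // exactly for NaN and +/-Inf.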
2163   mv(t0, 0x7c00);
2164   andr(tmp, src, t0);
2165   // jump to stub processing NaN and Inf cases.
2166   beq(t0, tmp, stub->entry());
2167 
2168   // non-NaN or non-Inf cases, just use built-in instructions.
2169   fmv_h_x(dst, src);
2170   fcvt_s_h(dst, dst);
2171 
2172   bind(stub->continuation());
2173 }
2174 
2175 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
2176 #define __ masm.
2177   Register dst = stub.data<0>();
2178   FloatRegister src = stub.data<1>();
2179   Register tmp = stub.data<2>();
2180   __ bind(stub.entry());
2181 
2182   __ fmv_x_w(dst, src);
2183 
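       // dst now holds the raw bits of a float NaN: the sign bit, an all-ones exponent and a
       // non-zero 23-bit payload. The shifts below keep the sign bit, reduce the exponent to
       // the 5-bit all-ones half-precision exponent, and keep the top 10 payload bits, leaving
       // the half-precision NaN in the low 16 bits of dst.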
2184   // preserve the payloads of non-canonical NaNs.
2185   __ srai(dst, dst, 13);
2186   // preserve the sign bit.
2187   __ srai(tmp, dst, 13);
2188   __ slli(tmp, tmp, 10);
2189   __ mv(t0, 0x3ff);
2190   __ orr(tmp, tmp, t0);
2191 
2192   // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2193   __ andr(dst, dst, tmp);
2194 
2195   __ j(stub.continuation());
2196 #undef __
2197 }
2198 
2199 // j.l.Float.floatToFloat16
2200 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
2201   auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);
2202 
2203   // On riscv, NaN needs special handling as fcvt does not work correctly in that case.
2204 
2205   // check whether it's a NaN.
2206   // replace fclass with feq as a performance optimization.
2207   feq_s(t0, src, src);
2208   // jump to stub processing NaN cases.
2209   beqz(t0, stub->entry());
2210 
2211   // non-NaN cases, just use built-in instructions.
2212   fcvt_h_s(ftmp, src);
2213   fmv_x_h(dst, ftmp);
2214 
2215   bind(stub->continuation());
2216 }
2217 
2218 static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
2219 #define __ masm.
2220   VectorRegister dst = stub.data<0>();
2221   VectorRegister src = stub.data<1>();
2222   uint vector_length = stub.data<2>();
2223   __ bind(stub.entry());
2224 
2225   // The following instructions mainly focus on NaN, as riscv does not handle
2226   // NaN well with vfwcvt_f_f_v; the code also works for Inf at the same time.
2227   //
2228   // construct NaNs in 32 bits from the NaNs in 16 bits;
2229   // we need the payloads of non-canonical NaNs to be preserved.
2230 
2231   // adjust vector type to 2 * SEW.
2232   __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
2233   // widen and sign-extend src data.
2234   __ vsext_vf2(dst, src, Assembler::v0_t);
2235   __ mv(t0, 0x7f800000);
2236   // sign-bit was already set via sign-extension if necessary.
2237   __ vsll_vi(dst, dst, 13, Assembler::v0_t);
2238   __ vor_vx(dst, dst, t0, Assembler::v0_t);
2239 
2240   __ j(stub.continuation());
2241 #undef __
2242 }
2243 
2244 // j.l.Float.float16ToFloat
2245 void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
2246   auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
2247               (dst, src, vector_length, 24, float16_to_float_v_slow_path);
2248   assert_different_registers(dst, src);
2249 
2250   // On riscv, NaN needs special handling as vfwcvt_f_f_v does not work correctly in that case.
2251   // Inf does not need special handling as vfwcvt_f_f_v can handle it correctly,
2252   // but we let the slow path process NaN and Inf at the same time,
2253   // as both of them are rare cases; making the slow path handle
2254   // only the NaN case would sacrifice the performance for the normal cases,
2255   // i.e. the non-NaN and non-Inf cases.
2256 
2257   vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);
2258 
2259   // check whether there is a NaN or +/- Inf.
2260   mv(t0, 0x7c00);
2261   vand_vx(v0, src, t0);
2262   // v0 will be used as mask in slow path.
2263   vmseq_vx(v0, v0, t0);
2264   vcpop_m(t0, v0);
2265 
2266   // For non-NaN or non-Inf cases, just use built-in instructions.
2267   vfwcvt_f_f_v(dst, src);
2268 
2269   // jump to stub processing NaN and Inf cases if there is any of them in the vector-wide.
2270   bnez(t0, stub->entry());
2271 
2272   bind(stub->continuation());
2273 }
2274 
2275 static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
2276                                          C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
2277 #define __ masm.
2278   VectorRegister dst = stub.data<0>();
2279   VectorRegister src = stub.data<1>();
2280   VectorRegister tmp = stub.data<2>();
2281   __ bind(stub.entry());
2282 
2283   // lmul is already set to mf2 in float_to_float16_v.
2284 
2285   // preserve the payloads of non-canonical NaNs.
2286   __ vnsra_wi(dst, src, 13, Assembler::v0_t);
2287 
2288   // preserve the sign bit.
2289   __ vnsra_wi(tmp, src, 26, Assembler::v0_t);
2290   __ vsll_vi(tmp, tmp, 10, Assembler::v0_t);
2291   __ mv(t0, 0x3ff);
2292   __ vor_vx(tmp, tmp, t0, Assembler::v0_t);
2293 
2294   // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2295   __ vand_vv(dst, dst, tmp, Assembler::v0_t);
2296 
2297   __ j(stub.continuation());
2298 #undef __
2299 }
2300 
2301 // j.l.Float.floatToFloat16
2302 void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src, VectorRegister vtmp,
2303                                            Register tmp, uint vector_length) {
2304   assert_different_registers(dst, src, vtmp);
2305 
2306   auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2307               (dst, src, vtmp, 28, float_to_float16_v_slow_path);
2308 
2309   // On riscv, NaN needs special handling as vfncvt_f_f_w does not work correctly in that case.
2310 
2311   vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);
2312 
2313   // check whether there is a NaN.
2314   // replace vfclass_v with vmfne_vv as a performance optimization.
2315   vmfne_vv(v0, src, src);
2316   vcpop_m(t0, v0);
2317 
2318   vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);
2319 
2320   // For non-NaN cases, just use built-in instructions.
2321   vfncvt_f_f_w(dst, src);
2322 
2323   // jump to stub processing NaN cases.
2324   bnez(t0, stub->entry());
2325 
2326   bind(stub->continuation());
2327 }
2328 
2329 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
2330   vsetvli_helper(bt, vlen);
2331 
2332   // check if input is -0, +0, signaling NaN or quiet NaN
2333   vfclass_v(v0, dst);
2334   mv(t0, fclass_mask::zero | fclass_mask::nan);
2335   vand_vx(v0, v0, t0);
2336   vmseq_vi(v0, v0, 0);
2337 
2338   // use floating-point 1.0 with a sign of input
2339   vfsgnj_vv(dst, one, dst, v0_t);
2340 }
2341 
2342 void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
2343   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2344   // intrinsic is enabled when MaxVectorSize >= 16
2345   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2346   long len = is_long ? 64 : 32;
2347 
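       // The bit-compress is implemented by expanding each bit of src and mask into one byte,
       // using vcompress to pack the src bytes selected by the mask bytes, and then converting
       // the packed bytes back into bits.
       // For example (32-bit): src = 0b1011 and mask = 0b0110 select src bits 1 and 2,
       // giving dst = 0b01.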
2348   // load the src data(in bits) to be compressed.
2349   vsetivli(x0, 1, sew, Assembler::m1);
2350   vmv_s_x(v0, src);
2351   // reset the src data(in bytes) to zero.
2352   mv(t0, len);
2353   vsetvli(x0, t0, Assembler::e8, lmul);
2354   vmv_v_i(v4, 0);
2355   // convert the src data from bits to bytes.
2356   vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
2357   // reset the dst data(in bytes) to zero.
2358   vmv_v_i(v8, 0);
2359   // load the mask data(in bits).
2360   vsetivli(x0, 1, sew, Assembler::m1);
2361   vmv_s_x(v0, mask);
2362   // compress the src data(in bytes) to dst(in bytes).
2363   vsetvli(x0, t0, Assembler::e8, lmul);
2364   vcompress_vm(v8, v4, v0);
2365   // convert the dst data from bytes to bits.
2366   vmseq_vi(v0, v8, 1);
2367   // store result back.
2368   vsetivli(x0, 1, sew, Assembler::m1);
2369   vmv_x_s(dst, v0);
2370 }
2371 
2372 void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
2373   compress_bits_v(dst, src, mask, /* is_long */ false);
2374 }
2375 
2376 void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
2377   compress_bits_v(dst, src, mask, /* is_long */ true);
2378 }
2379 
2380 void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
2381   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2382   // intrinsic is enabled when MaxVectorSize >= 16
2383   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2384   long len = is_long ? 64 : 32;
2385 
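       // The bit-expand is the inverse operation: viota_m numbers the set mask bits, and
       // vrgather scatters consecutive low bits of src to the positions of those set bits.
       // For example (32-bit): src = 0b11 and mask = 0b0110 place src bits 0 and 1 at
       // positions 1 and 2, giving dst = 0b0110.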
2386   // load the src data(in bits) to be expanded.
2387   vsetivli(x0, 1, sew, Assembler::m1);
2388   vmv_s_x(v0, src);
2389   // reset the src data(in bytes) to zero.
2390   mv(t0, len);
2391   vsetvli(x0, t0, Assembler::e8, lmul);
2392   vmv_v_i(v4, 0);
2393   // convert the src data from bits to bytes.
2394   vmerge_vim(v4, v4, 1); // v0 as implicit mask register
2395   // reset the dst data(in bytes) to zero.
2396   vmv_v_i(v12, 0);
2397   // load the mask data(in bits).
2398   vsetivli(x0, 1, sew, Assembler::m1);
2399   vmv_s_x(v0, mask);
2400   // expand the src data(in bytes) to dst(in bytes).
2401   vsetvli(x0, t0, Assembler::e8, lmul);
2402   viota_m(v8, v0);
2403   vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
2404   // convert the dst data from bytes to bits.
2405   vmseq_vi(v0, v12, 1);
2406   // store result back.
2407   vsetivli(x0, 1, sew, Assembler::m1);
2408   vmv_x_s(dst, v0);
2409 }
2410 
2411 void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
2412   expand_bits_v(dst, src, mask, /* is_long */ false);
2413 }
2414 
2415 void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
2416   expand_bits_v(dst, src, mask, /* is_long */ true);
2417 }
2418 
2419 // j.l.Math.round(float)
2420 //  Returns the closest int to the argument, with ties rounding to positive infinity.
2421 // We need to handle 3 special cases defined by java api spec:
2422 //    NaN,
2423 //    float >= Integer.MAX_VALUE,
2424 //    float <= Integer.MIN_VALUE.
2425 void C2_MacroAssembler::java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2426                                            BasicType bt, uint vector_length) {
2427   // On riscv, no single rounding mode matches the behaviour defined by the Java API spec,
2428   // i.e. every rounding mode mishandles some corner case, e.g.
2429   //  RNE is the closest one, but it ties to "even", which means 1.5/2.5 would both be converted
2430   //    to 2, instead of 2 and 3 respectively.
2431   //  RUP does not work either: although the Java API requires "rounding to positive infinity",
2432   //    both 1.3/1.8 would be converted to 2, instead of 1 and 2 respectively.
2433   //
2434   // The optimal solution for non-NaN cases is:
2435   //    src+0.5 => dst, with rdn rounding mode,
2436   //    convert dst from float to int, with rdn rounding mode.
2437   // and this solution works as expected for float >= Integer.MAX_VALUE and float <= Integer.MIN_VALUE.
2438   //
2439   // But we still need to handle NaN explicitly with vector mask instructions.
2440   //
2441   // Check MacroAssembler::java_round_float and C2_MacroAssembler::vector_round_sve in aarch64 for more details.
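       //
       // For example, with this scheme 2.5 -> floor(2.5 + 0.5) == floor(3.0) == 3 and
       // -2.5 -> floor(-2.5 + 0.5) == floor(-2.0) == -2, so ties are rounded towards
       // positive infinity as Math.round requires.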
2442 
2443   csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2444   vsetvli_helper(bt, vector_length);
2445 
2446   // don't rearrange the instruction sequence without performance testing.
2447   // check MacroAssembler::java_round_float in riscv64 for more details.
2448   mv(t0, jint_cast(0.5f));
2449   fmv_w_x(ftmp, t0);
2450 
2451   // replacing vfclass_v with vmfeq_vv as a performance optimization
2452   vmfeq_vv(v0, src, src);
2453   // set dst = 0 in cases of NaN
2454   vmv_v_x(dst, zr);
2455 
2456   // dst = (src + 0.5) rounded down towards negative infinity
2457   vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2458   vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2459 
2460   csrwi(CSR_FRM, C2_MacroAssembler::rne);
2461 }
2462 
2463 // java.lang.Math.round(double a)
2464 // Returns the closest long to the argument, with ties rounding to positive infinity.
2465 void C2_MacroAssembler::java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2466                                             BasicType bt, uint vector_length) {
2467   // check C2_MacroAssembler::java_round_float_v above for more details.
2468 
2469   csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2470   vsetvli_helper(bt, vector_length);
2471 
2472   mv(t0, julong_cast(0.5));
2473   fmv_d_x(ftmp, t0);
2474 
2475   // replacing vfclass_v with vmfeq_vv as a performance optimization
2476   vmfeq_vv(v0, src, src);
2477   // set dst = 0 in cases of NaN
2478   vmv_v_x(dst, zr);
2479 
2480   // dst = (src + 0.5) rounded down towards negative infinity
2481   vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2482   vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2483 
2484   csrwi(CSR_FRM, C2_MacroAssembler::rne);
2485 }
2486 
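     // Vectorized element-compare loop shared by the string/array equality and compare
     // intrinsics: each iteration compares up to vl elements of a1 and a2; on the first
     // mismatch it branches to DONE with the index of the mismatch within the current
     // chunk in tmp2, otherwise it falls through with result set to true.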
2487 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
2488                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE,
2489                                         Assembler::LMUL lmul) {
2490   Label loop;
2491   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
2492 
2493   bind(loop);
2494   vsetvli(tmp1, cnt, sew, lmul);
2495   vlex_v(vr1, a1, sew);
2496   vlex_v(vr2, a2, sew);
2497   vmsne_vv(vrs, vr1, vr2);
2498   vfirst_m(tmp2, vrs);
2499   bgez(tmp2, DONE);
2500   sub(cnt, cnt, tmp1);
2501   if (!islatin) {
2502     slli(tmp1, tmp1, 1); // get byte counts
2503   }
2504   add(a1, a1, tmp1);
2505   add(a2, a2, tmp1);
2506   bnez(cnt, loop);
2507 
2508   mv(result, true);
2509 }
2510 
2511 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
2512   Label DONE;
2513   Register tmp1 = t0;
2514   Register tmp2 = t1;
2515 
2516   BLOCK_COMMENT("string_equals_v {");
2517 
2518   mv(result, false);
2519 
2520   element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE, Assembler::m2);
2521 
2522   bind(DONE);
2523   BLOCK_COMMENT("} string_equals_v");
2524 }
2525 
2526 // used by C2 ClearArray patterns.
2527 // base: Address of a buffer to be zeroed
2528 // cnt: Count in HeapWords
2529 //
2530 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2531 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
2532   Label loop;
2533 
2534   // making zero words
2535   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2536   vxor_vv(v4, v4, v4);
2537 
2538   bind(loop);
2539   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2540   vse64_v(v4, base);
2541   sub(cnt, cnt, t0);
2542   shadd(base, t0, base, t0, 3);
2543   bnez(cnt, loop);
2544 }
2545 
2546 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
2547                                         Register cnt1, int elem_size) {
2548   Label DONE;
2549   Register tmp1 = t0;
2550   Register tmp2 = t1;
2551   Register cnt2 = tmp2;
2552   int length_offset = arrayOopDesc::length_offset_in_bytes();
2553   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
2554 
2555   BLOCK_COMMENT("arrays_equals_v {");
2556 
2557   // if (a1 == a2), return true
2558   mv(result, true);
2559   beq(a1, a2, DONE);
2560 
2561   mv(result, false);
2562   // if a1 == null or a2 == null, return false
2563   beqz(a1, DONE);
2564   beqz(a2, DONE);
2565   // if (a1.length != a2.length), return false
2566   lwu(cnt1, Address(a1, length_offset));
2567   lwu(cnt2, Address(a2, length_offset));
2568   bne(cnt1, cnt2, DONE);
2569 
2570   la(a1, Address(a1, base_offset));
2571   la(a2, Address(a2, base_offset));
2572 
2573   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE, Assembler::m2);
2574 
2575   bind(DONE);
2576 
2577   BLOCK_COMMENT("} arrays_equals_v");
2578 }
2579 
2580 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
2581                                          Register result, Register tmp1, Register tmp2, int encForm) {
2582   Label DIFFERENCE, DONE, L, loop;
2583   bool encLL = encForm == StrIntrinsicNode::LL;
2584   bool encLU = encForm == StrIntrinsicNode::LU;
2585   bool encUL = encForm == StrIntrinsicNode::UL;
2586 
2587   bool str1_isL = encLL || encLU;
2588   bool str2_isL = encLL || encUL;
2589 
2590   int minCharsInWord = encLL ? wordSize : wordSize / 2;
2591 
2592   BLOCK_COMMENT("string_compare {");
2593 
2594   // for Latin strings, 1 byte for 1 character
2595   // for UTF16 strings, 2 bytes for 1 character
2596   if (!str1_isL)
2597     sraiw(cnt1, cnt1, 1);
2598   if (!str2_isL)
2599     sraiw(cnt2, cnt2, 1);
2600 
2601   // result is the length difference; it is returned if the strings are equal up to the minimum length.
2602   // save the minimum of the string lengths in cnt2.
2603   sub(result, cnt1, cnt2);
2604   bgt(cnt1, cnt2, L);
2605   mv(cnt2, cnt1);
2606   bind(L);
2607 
2608   // We focus on optimizing small-sized strings.
2609   // Please check below document for string size distribution statistics.
2610   // https://cr.openjdk.org/~shade/density/string-density-report.pdf
2611   if (str1_isL == str2_isL) { // LL or UU
2612     // The choice of vector registers and lmul below is based on testing on 2 different boards,
2613     // with vlen == 128 and vlen == 256 respectively.
2614     if (!encLL && MaxVectorSize == 16) { // UU
2615       element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v4, v8, v4, encLL, DIFFERENCE, Assembler::m4);
2616     } else { // LL, or UU with MaxVectorSize != 16
2617       element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE, Assembler::m2);
2618     }
2619 
2620     j(DONE);
2621   } else { // LU or UL
2622     Register strL = encLU ? str1 : str2;
2623     Register strU = encLU ? str2 : str1;
2624     VectorRegister vstr1 = encLU ? v8 : v4;
2625     VectorRegister vstr2 = encLU ? v4 : v8;
2626 
2627     bind(loop);
2628     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
2629     vle8_v(vstr1, strL);
2630     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
2631     vzext_vf2(vstr2, vstr1);
2632     vle16_v(vstr1, strU);
2633     vmsne_vv(v4, vstr2, vstr1);
2634     vfirst_m(tmp2, v4);
2635     bgez(tmp2, DIFFERENCE);
2636     sub(cnt2, cnt2, tmp1);
2637     add(strL, strL, tmp1);
2638     shadd(strU, tmp1, strU, tmp1, 1);
2639     bnez(cnt2, loop);
2640     j(DONE);
2641   }
2642 
2643   bind(DIFFERENCE);
2644   slli(tmp1, tmp2, 1);
2645   add(str1, str1, str1_isL ? tmp2 : tmp1);
2646   add(str2, str2, str2_isL ? tmp2 : tmp1);
2647   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2648   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2649   sub(result, tmp1, tmp2);
2650 
2651   bind(DONE);
2652 }
2653 
2654 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2655   Label loop;
2656   assert_different_registers(src, dst, len, tmp, t0);
2657 
2658   BLOCK_COMMENT("byte_array_inflate_v {");
2659   bind(loop);
2660   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2661   vle8_v(v6, src);
2662   vsetvli(t0, len, Assembler::e16, Assembler::m4);
2663   vzext_vf2(v4, v6);
2664   vse16_v(v4, dst);
2665   sub(len, len, tmp);
2666   add(src, src, tmp);
2667   shadd(dst, tmp, dst, tmp, 1);
2668   bnez(len, loop);
2669   BLOCK_COMMENT("} byte_array_inflate_v");
2670 }
2671 
2672 // Compress char[] array to byte[].
2673 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
2674 // result: the array length if every element in array can be encoded,
2675 // otherwise, the index of first non-latin1 (> 0xff) character.
2676 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2677                                               Register result, Register tmp) {
2678   encode_iso_array_v(src, dst, len, result, tmp, false);
2679 }
2680 
2681 // Intrinsic for
2682 //
2683 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
2684 //     return the number of characters copied.
2685 // - java/lang/StringUTF16.compress
2686 //     return index of non-latin1 character if copy fails, otherwise 'len'.
2687 //
2688 // This version always returns the number of characters copied. A successful
2689 // copy will complete with the post-condition: 'res' == 'len', while an
2690 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2691 //
2692 // Clobbers: src, dst, len, result, t0
2693 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2694                                            Register result, Register tmp, bool ascii) {
2695   Label loop, fail, done;
2696 
2697   BLOCK_COMMENT("encode_iso_array_v {");
2698   mv(result, 0);
2699 
2700   bind(loop);
2701   mv(tmp, ascii ? 0x7f : 0xff);
2702   vsetvli(t0, len, Assembler::e16, Assembler::m2);
2703   vle16_v(v2, src);
2704 
2705   vmsgtu_vx(v1, v2, tmp);
2706   vfirst_m(tmp, v1);
2707   vmsbf_m(v0, v1);
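       // v0 now masks only the elements before the first out-of-range character,
       // so the narrowing store below writes just the convertible prefix.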
2708   // compress char to byte
2709   vsetvli(t0, len, Assembler::e8);
2710   vncvt_x_x_w(v1, v2, Assembler::v0_t);
2711   vse8_v(v1, dst, Assembler::v0_t);
2712 
2713   // fail if char > 0x7f/0xff
2714   bgez(tmp, fail);
2715   add(result, result, t0);
2716   add(dst, dst, t0);
2717   sub(len, len, t0);
2718   shadd(src, t0, src, t0, 1);
2719   bnez(len, loop);
2720   j(done);
2721 
2722   bind(fail);
2723   add(result, result, tmp);
2724 
2725   bind(done);
2726   BLOCK_COMMENT("} encode_iso_array_v");
2727 }
2728 
2729 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2730   Label LOOP, SET_RESULT, DONE;
2731 
2732   BLOCK_COMMENT("count_positives_v {");
2733   assert_different_registers(ary, len, result, tmp);
2734 
2735   mv(result, zr);
2736 
2737   bind(LOOP);
2738   vsetvli(t0, len, Assembler::e8, Assembler::m4);
2739   vle8_v(v4, ary);
2740   vmslt_vx(v4, v4, zr);
2741   vfirst_m(tmp, v4);
2742   bgez(tmp, SET_RESULT);
2743   // if tmp == -1, all bytes are positive
2744   add(result, result, t0);
2745 
2746   sub(len, len, t0);
2747   add(ary, ary, t0);
2748   bnez(len, LOOP);
2749   j(DONE);
2750 
2751   // add remaining positive bytes count
2752   bind(SET_RESULT);
2753   add(result, result, tmp);
2754 
2755   bind(DONE);
2756   BLOCK_COMMENT("} count_positives_v");
2757 }
2758 
2759 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2760                                               Register ch, Register result,
2761                                               Register tmp1, Register tmp2,
2762                                               bool isL) {
2763   mv(result, zr);
2764 
2765   Label loop, MATCH, DONE;
2766   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2767   bind(loop);
2768   vsetvli(tmp1, cnt1, sew, Assembler::m4);
2769   vlex_v(v4, str1, sew);
2770   vmseq_vx(v4, v4, ch);
2771   vfirst_m(tmp2, v4);
2772   bgez(tmp2, MATCH); // if equal, return index
2773 
2774   add(result, result, tmp1);
2775   sub(cnt1, cnt1, tmp1);
2776   if (!isL) slli(tmp1, tmp1, 1);
2777   add(str1, str1, tmp1);
2778   bnez(cnt1, loop);
2779 
2780   mv(result, -1);
2781   j(DONE);
2782 
2783   bind(MATCH);
2784   add(result, result, tmp2);
2785 
2786   bind(DONE);
2787 }
2788 
2789 // Set dst to NaN if any NaN input.
2790 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2791                                     BasicType bt, bool is_min, uint vector_length) {
2792   assert_different_registers(dst, src1, src2);
2793 
2794   vsetvli_helper(bt, vector_length);
2795 
2796   is_min ? vfmin_vv(dst, src1, src2)
2797          : vfmax_vv(dst, src1, src2);
2798 
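       // Propagate NaNs: for every lane where src1 (or src2) is NaN, x + x regenerates a
       // quiet NaN and overwrites the min/max result in that lane.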
2799   vmfne_vv(v0,  src1, src1);
2800   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2801   vmfne_vv(v0,  src2, src2);
2802   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2803 }
2804 
2805 // Set dst to NaN if any NaN input.
2806 // The destination vector register elements corresponding to masked-off elements
2807 // are handled with a mask-undisturbed policy.
2808 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2809                                            VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2810                                            BasicType bt, bool is_min, uint vector_length) {
2811   assert_different_registers(src1, src2, tmp1, tmp2);
2812   vsetvli_helper(bt, vector_length);
2813 
2814   // Check vector elements of src1 and src2 for NaN.
2815   vmfeq_vv(tmp1, src1, src1);
2816   vmfeq_vv(tmp2, src2, src2);
2817 
2818   vmandn_mm(v0, vmask, tmp1);
2819   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2820   vmandn_mm(v0, vmask, tmp2);
2821   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2822 
2823   vmand_mm(tmp2, tmp1, tmp2);
2824   vmand_mm(v0, vmask, tmp2);
2825   is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
2826          : vfmax_vv(dst, src1, src2, Assembler::v0_t);
2827 }
2828 
2829 // Set dst to NaN if any NaN input.
2830 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
2831                                            FloatRegister src1, VectorRegister src2,
2832                                            VectorRegister tmp1, VectorRegister tmp2,
2833                                            bool is_double, bool is_min, uint vector_length, VectorMask vm) {
2834   assert_different_registers(dst, src1);
2835   assert_different_registers(src2, tmp1, tmp2);
2836 
2837   Label L_done, L_NaN_1, L_NaN_2;
2838   // Set dst to src1 if src1 is NaN
2839   is_double ? feq_d(t0, src1, src1)
2840             : feq_s(t0, src1, src1);
2841   beqz(t0, L_NaN_2);
2842 
2843   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2844   vfmv_s_f(tmp2, src1);
2845 
2846   is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
2847          : vfredmax_vs(tmp1, src2, tmp2, vm);
2848   vfmv_f_s(dst, tmp1);
2849 
2850   // Checking NaNs in src2
2851   vmfne_vv(tmp1, src2, src2, vm);
2852   vcpop_m(t0, tmp1, vm);
2853   beqz(t0, L_done);
2854 
2855   bind(L_NaN_1);
2856   vfredusum_vs(tmp1, src2, tmp2, vm);
2857   vfmv_f_s(dst, tmp1);
2858   j(L_done);
2859 
2860   bind(L_NaN_2);
2861   is_double ? fmv_d(dst, src1)
2862             : fmv_s(dst, src1);
2863   bind(L_done);
2864 }
2865 
2866 bool C2_MacroAssembler::in_scratch_emit_size() {
2867   if (ciEnv::current()->task() != nullptr) {
2868     PhaseOutput* phase_output = Compile::current()->output();
2869     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2870       return true;
2871     }
2872   }
2873   return MacroAssembler::in_scratch_emit_size();
2874 }
2875 
2876 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
2877                                           VectorRegister src2, VectorRegister tmp,
2878                                           int opc, BasicType bt, uint vector_length, VectorMask vm) {
2879   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2880   vsetvli_helper(bt, vector_length);
2881   vmv_s_x(tmp, src1);
2882   switch (opc) {
2883     case Op_AddReductionVI:
2884     case Op_AddReductionVL:
2885       vredsum_vs(tmp, src2, tmp, vm);
2886       break;
2887     case Op_AndReductionV:
2888       vredand_vs(tmp, src2, tmp, vm);
2889       break;
2890     case Op_OrReductionV:
2891       vredor_vs(tmp, src2, tmp, vm);
2892       break;
2893     case Op_XorReductionV:
2894       vredxor_vs(tmp, src2, tmp, vm);
2895       break;
2896     case Op_MaxReductionV:
2897       vredmax_vs(tmp, src2, tmp, vm);
2898       break;
2899     case Op_MinReductionV:
2900       vredmin_vs(tmp, src2, tmp, vm);
2901       break;
2902     default:
2903       ShouldNotReachHere();
2904   }
2905   vmv_x_s(dst, tmp);
2906 }
2907 
2908 // Set vl and vtype for full and partial vector operations.
2909 // (vma = mu, vta = tu, vill = false)
2910 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
2911   Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
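       // vsetivli encodes the requested vector length as a 5-bit immediate,
       // so it can only be used when vector_length fits in [0, 31].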
2912   if (vector_length <= 31) {
2913     vsetivli(tmp, vector_length, sew, vlmul);
2914   } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
2915     vsetvli(tmp, x0, sew, vlmul);
2916   } else {
2917     mv(tmp, vector_length);
2918     vsetvli(tmp, tmp, sew, vlmul);
2919   }
2920 }
2921 
2922 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2923                                            int cond, BasicType bt, uint vector_length, VectorMask vm) {
2924   assert(is_integral_type(bt), "unsupported element type");
2925   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2926   vsetvli_helper(bt, vector_length);
2927   vmclr_m(vd);
2928   switch (cond) {
2929     case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
2930     case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
2931     case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
2932     case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
2933     case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
2934     case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
2935     case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break;
2936     case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break;
2937     case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break;
2938     case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break;
2939     default:
2940       assert(false, "unsupported compare condition");
2941       ShouldNotReachHere();
2942   }
2943 }
2944 
2945 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2946                                      int cond, BasicType bt, uint vector_length, VectorMask vm) {
2947   assert(is_floating_point_type(bt), "unsupported element type");
2948   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2949   vsetvli_helper(bt, vector_length);
2950   vmclr_m(vd);
  switch (cond) {
    case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
    case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
    case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
    case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
    case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
    case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

// In Matcher::scalable_predicate_reg_slots,
// we assume each predicate register is one-eighth of the size of
// a scalable vector register, one mask bit per vector byte.
void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) {
  vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
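  // One mask bit per vector byte: MaxVectorSize bytes of vector state need
  // MaxVectorSize / 8 bytes of mask, hence the MaxVectorSize >> 3 length here.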
  add(t0, sp, offset);
  vse8_v(v, t0);
}

void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) {
  vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
  add(t0, sp, offset);
  vle8_v(v, t0);
}

void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
                                         VectorRegister src, BasicType src_bt, bool is_signed) {
  assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
  assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
  // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
  // Overlap of the destination and the source is only legal when the destination EEW
  // is greater than the source EEW, the source EMUL is at least 1, and the overlap is
  // in the highest-numbered part of the destination register group. With LMUL = 1 the
  // source EMUL here is fractional, so dst and src must be different registers.
  assert_different_registers(dst, src);

  vsetvli_helper(dst_bt, vector_length);
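  // vsext.vf2/vf4/vf8 (and the vzext variants) widen from a source whose EEW is
  // 1/2, 1/4 or 1/8 of SEW, so the variant below is picked by the dst/src size ratio.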
  if (is_signed) {
    if (src_bt == T_BYTE) {
      switch (dst_bt) {
      case T_SHORT:
        vsext_vf2(dst, src);
        break;
      case T_INT:
        vsext_vf4(dst, src);
        break;
      case T_LONG:
        vsext_vf8(dst, src);
        break;
      default:
        ShouldNotReachHere();
      }
    } else if (src_bt == T_SHORT) {
      if (dst_bt == T_INT) {
        vsext_vf2(dst, src);
      } else {
        vsext_vf4(dst, src);
      }
    } else if (src_bt == T_INT) {
      vsext_vf2(dst, src);
    }
  } else {
    if (src_bt == T_BYTE) {
      switch (dst_bt) {
      case T_SHORT:
        vzext_vf2(dst, src);
        break;
      case T_INT:
        vzext_vf4(dst, src);
        break;
      case T_LONG:
        vzext_vf8(dst, src);
        break;
      default:
        ShouldNotReachHere();
      }
    } else if (src_bt == T_SHORT) {
      if (dst_bt == T_INT) {
        vzext_vf2(dst, src);
      } else {
        vzext_vf4(dst, src);
      }
    } else if (src_bt == T_INT) {
      vzext_vf2(dst, src);
    }
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
                                         VectorRegister src, BasicType src_bt) {
  assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
  assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
  mv(t0, vector_length);
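  // vncvt.x.x.w narrows elements of width 2*SEW down to SEW, so narrowing by more
  // than a factor of two is done as a chain of conversions, halving SEW (at a
  // fractional LMUL) on each step.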
  if (src_bt == T_LONG) {
    // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
    // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
    // So we can currently only scale down by 1/2 the width at a time.
    vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
    vncvt_x_x_w(dst, src);
    if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
      vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
      vncvt_x_x_w(dst, dst);
      if (dst_bt == T_BYTE) {
        vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
        vncvt_x_x_w(dst, dst);
      }
    }
  } else if (src_bt == T_INT) {
    // Narrow to T_SHORT first
    vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
    vncvt_x_x_w(dst, src);
    if (dst_bt == T_BYTE) {
      vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
      vncvt_x_x_w(dst, dst);
    }
  } else if (src_bt == T_SHORT) {
    vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
    vncvt_x_x_w(dst, src);
  }
}

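// NaN-safe float-to-integer conversion: v0 = (src == src) is false exactly for NaN
// lanes, dst is zeroed first, and the conversion is then done under that mask, so
// NaN inputs produce 0 (matching Java's conversion semantics) while all other lanes
// convert normally.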
#define VFCVT_SAFE(VFLOATCVT)                                                      \
void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
  assert_different_registers(dst, src);                                            \
  vxor_vv(dst, dst, dst);                                                          \
  vmfeq_vv(v0, src, src);                                                          \
  VFLOATCVT(dst, src, Assembler::v0_t);                                            \
}

VFCVT_SAFE(vfcvt_rtz_x_f_v);

#undef VFCVT_SAFE

// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
                                  int idx, VectorRegister tmp) {
  assert(is_integral_type(bt), "unsupported element type");
  assert(idx >= 0, "idx cannot be negative");
  // Only need the first element after vector slidedown
  vsetvli_helper(bt, 1);
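  // vslidedown.vi encodes the offset as a 5-bit unsigned immediate, so indices
  // above 31 fall back to the register (vx) form.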
  if (idx == 0) {
    vmv_x_s(dst, src);
  } else if (idx <= 31) {
    vslidedown_vi(tmp, src, idx);
    vmv_x_s(dst, tmp);
  } else {
    mv(t0, idx);
    vslidedown_vx(tmp, src, t0);
    vmv_x_s(dst, tmp);
  }
}

// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of floating point type.
void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
                                     int idx, VectorRegister tmp) {
  assert(is_floating_point_type(bt), "unsupported element type");
  assert(idx >= 0, "idx cannot be negative");
  // Only need the first element after vector slidedown
  vsetvli_helper(bt, 1);
  if (idx == 0) {
    vfmv_f_s(dst, src);
  } else if (idx <= 31) {
    vslidedown_vi(tmp, src, idx);
    vfmv_f_s(dst, tmp);
  } else {
    mv(t0, idx);
    vslidedown_vx(tmp, src, t0);
    vfmv_f_s(dst, tmp);
  }
}