1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/compile.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
  48                                   Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
  // Use t1 as a flag register to indicate the fast_lock result: zero for success; non-zero for failure.
  50   Register flag = t1;
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmp1Reg;
  54   Register tmp = tmp2Reg;
  55   Label object_has_monitor;
  // Finish fast lock successfully. MUST be reached with flag == 0.
  57   Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST be reached with flag != 0.
  59   Label slow_path;
  60 
  61   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  62   assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);
  63 
  64   mv(flag, 1);
  65 
  66   // Load markWord from object into displaced_header.
  67   ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  68 
  69   if (DiagnoseSyncOnValueBasedClasses != 0) {
  70     load_klass(tmp, oop);
  71     lbu(tmp, Address(tmp, Klass::misc_flags_offset()));
  72     test_bit(tmp, tmp, exact_log2(KlassFlags::_misc_is_value_based_class));
  73     bnez(tmp, slow_path);
  74   }
  75 
  76   // Check for existing monitor
  77   test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value));
  78   bnez(tmp, object_has_monitor);
  79 
  80   if (LockingMode == LM_MONITOR) {
  81     j(slow_path);
  82   } else {
  83     assert(LockingMode == LM_LEGACY, "must be");
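    // Legacy stack-locking: the two low mark bits encode the lock state
    // (0b01 unlocked, 0b00 stack-locked, 0b10 inflated monitor). We try to
    // swing the mark from its unlocked value to the address of our on-stack box.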
  84     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  85     ori(tmp, disp_hdr, markWord::unlocked_value);
  86 
  87     // Initialize the box. (Must happen before we update the object mark!)
  88     sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  89 
  90     // Compare object markWord with an unlocked value (tmp) and if
  91     // equal exchange the stack address of our box with object markWord.
  92     // On failure disp_hdr contains the possibly locked markWord.
  93     cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64,
  94             Assembler::aq, Assembler::rl, /*result*/disp_hdr);
  95     beq(disp_hdr, tmp, locked);
  96 
  97     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  98 
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and continue at label locked.
    // Otherwise we did not see an unlocked object, so try the fast recursive case.
 102 
 103     // Check if the owner is self by comparing the value in the
 104     // markWord of object (disp_hdr) with the stack pointer.
 105     sub(disp_hdr, disp_hdr, sp);
 106     mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
    // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking
    // recursively and go to label locked; hence we can store 0 as the displaced
    // header in the box, which indicates that it is a recursive lock.
 110     andr(tmp/*==0?*/, disp_hdr, tmp);
 111     sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 112     beqz(tmp, locked);
 113     j(slow_path);
 114   }
 115 
 116   // Handle existing monitor.
 117   bind(object_has_monitor);
 118   // The object's monitor m is unlocked iff m->owner == nullptr,
 119   // otherwise m->owner may contain a thread or a stack address.
 120   //
 121   // Try to CAS m->owner from null to current thread.
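  // disp_hdr still holds the tagged markWord (monitor address | monitor_value), so
  // adding the owner offset and subtracting the tag yields the address of m->owner.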
 122   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
 123   cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64,
 124           Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected)
 125 
 126   // Store a non-null value into the box to avoid looking like a re-entrant
 127   // lock. The fast-path monitor unlock code checks for
 128   // markWord::monitor_value so use markWord::unused_mark which has the
 129   // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
 130   mv(tmp, (address)markWord::unused_mark().value());
 131   sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 132 
 133   beqz(tmp3Reg, locked); // CAS success means locking succeeded
 134 
 135   bne(tmp3Reg, xthread, slow_path); // Check for recursive locking
 136 
 137   // Recursive lock case
 138   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg);
 139 
 140   bind(locked);
 141   mv(flag, zr);
 142   increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2Reg, tmp3Reg);
 143 
 144 #ifdef ASSERT
 145   // Check that locked label is reached with flag == 0.
 146   Label flag_correct;
 147   beqz(flag, flag_correct);
 148   stop("Fast Lock Flag != 0");
 149 #endif
 150 
 151   bind(slow_path);
 152 #ifdef ASSERT
 153   // Check that slow_path label is reached with flag != 0.
 154   bnez(flag, flag_correct);
 155   stop("Fast Lock Flag == 0");
 156   bind(flag_correct);
 157 #endif
 158   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 159 }
 160 
 161 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
 162                                     Register tmp1Reg, Register tmp2Reg) {
  // Use t1 as a flag register to indicate the fast_unlock result: zero for success; non-zero for failure.
 164   Register flag = t1;
 165   Register oop = objectReg;
 166   Register box = boxReg;
 167   Register disp_hdr = tmp1Reg;
 168   Register owner_addr = tmp1Reg;
 169   Register tmp = tmp2Reg;
 170   Label object_has_monitor;
  // Finish fast unlock successfully. MUST be reached with flag == 0.
  Label unlocked;
  // Finish fast unlock unsuccessfully. slow_path MUST be reached with flag != 0.
 174   Label slow_path;
 175 
 176   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 177   assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);
 178 
 179   mv(flag, 1);
 180 
 181   if (LockingMode == LM_LEGACY) {
 182     // Find the lock address and load the displaced header from the stack.
 183     ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 184 
 185     // If the displaced header is 0, we have a recursive unlock.
 186     beqz(disp_hdr, unlocked);
 187   }
 188 
 189   // Handle existing monitor.
 190   ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 191   test_bit(t0, tmp, exact_log2(markWord::monitor_value));
 192   bnez(t0, object_has_monitor);
 193 
 194   if (LockingMode == LM_MONITOR) {
 195     j(slow_path);
 196   } else {
 197     assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we see the
    // stack address of the BasicLock in the markWord of the object.
 201 
 202     cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64,
 203             Assembler::relaxed, Assembler::rl, /*result*/tmp);
 204     beq(box, tmp, unlocked); // box == tmp if cas succeeds
 205     j(slow_path);
 206   }
 207 
 208   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 209 
 210   // Handle existing monitor.
 211   bind(object_has_monitor);
 212   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 213   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 214 
 215   ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 216 
 217   Label notRecursive;
 218   beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.
 219 
 220   // Recursive lock
 221   addi(disp_hdr, disp_hdr, -1);
 222   sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 223   j(unlocked);
 224 
 225   bind(notRecursive);
 226   // Compute owner address.
 227   la(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));
 228 
 229   // Set owner to null.
 230   // Release to satisfy the JMM
 231   membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 232   sd(zr, Address(owner_addr));
 233   // We need a full fence after clearing owner to avoid stranding.
 234   // StoreLoad achieves this.
 235   membar(StoreLoad);
 236 
 237   // Check if the entry lists are empty (EntryList first - by convention).
 238   ld(t0, Address(tmp, ObjectMonitor::EntryList_offset()));
 239   ld(tmp1Reg, Address(tmp, ObjectMonitor::cxq_offset()));
 240   orr(t0, t0, tmp1Reg);
 241   beqz(t0, unlocked); // If so we are done.
 242 
 243   // Check if there is a successor.
 244   ld(t0, Address(tmp, ObjectMonitor::succ_offset()));
 245   bnez(t0, unlocked); // If so we are done.
 246 
 247   // Save the monitor pointer in the current thread, so we can try to
 248   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 249   sd(tmp, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));
 250 
 251   mv(flag, 1);
 252   j(slow_path);
 253 
 254   bind(unlocked);
 255   mv(flag, zr);
 256   decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp1Reg, tmp2Reg);
 257 
 258 #ifdef ASSERT
 259   // Check that unlocked label is reached with flag == 0.
 260   Label flag_correct;
 261   beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
 263 #endif
 264 
 265   bind(slow_path);
 266 #ifdef ASSERT
 267   // Check that slow_path label is reached with flag != 0.
 268   bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
 270   bind(flag_correct);
 271 #endif
 272   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 273 }
 274 
 275 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box,
 276                                               Register tmp1, Register tmp2, Register tmp3) {
 277   // Flag register, zero for success; non-zero for failure.
 278   Register flag = t1;
 279 
 280   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 281   assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0);
 282 
 283   mv(flag, 1);
 284 
 285   // Handle inflated monitor.
 286   Label inflated;
  // Finish fast lock successfully. MUST be reached with flag == 0.
  Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST be reached with flag != 0.
 290   Label slow_path;
 291 
 292   if (UseObjectMonitorTable) {
 293     // Clear cache in case fast locking succeeds.
 294     sd(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 295   }
 296 
 297   if (DiagnoseSyncOnValueBasedClasses != 0) {
 298     load_klass(tmp1, obj);
 299     lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
 300     test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
 301     bnez(tmp1, slow_path);
 302   }
 303 
 304   const Register tmp1_mark = tmp1;
 305   const Register tmp3_t = tmp3;
 306 
 307   { // Lightweight locking
 308 
    // Push lock to the lock stack and finish successfully. MUST be reached with flag == 0.
 310     Label push;
 311 
 312     const Register tmp2_top = tmp2;
 313 
 314     // Check if lock-stack is full.
 315     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 316     mv(tmp3_t, (unsigned)LockStack::end_offset());
 317     bge(tmp2_top, tmp3_t, slow_path);
 318 
 319     // Check if recursive.
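    // Lightweight locking treats the lock as recursive iff obj is already the
    // topmost entry on this thread's lock-stack (tmp2_top is the byte offset of
    // the first free slot, so the top entry is at offset -oopSize).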
 320     add(tmp3_t, xthread, tmp2_top);
 321     ld(tmp3_t, Address(tmp3_t, -oopSize));
 322     beq(obj, tmp3_t, push);
 323 
 324     // Relaxed normal load to check for monitor. Optimization for monitor case.
 325     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 326     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 327     bnez(tmp3_t, inflated);
 328 
 329     // Not inflated
 330     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
 331 
 332     // Try to lock. Transition lock-bits 0b01 => 0b00
 333     ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
 334     xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
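    // tmp1_mark now holds the expected (unlocked, lock bits 0b01) mark and tmp3_t
    // the locked (0b00) mark to install; the CAS below succeeds only if the
    // object's mark still equals the expected unlocked value.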
 335     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 336             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
 337     bne(tmp1_mark, tmp3_t, slow_path);
 338 
 339     bind(push);
 340     // After successful lock, push object on lock-stack.
 341     add(tmp3_t, xthread, tmp2_top);
 342     sd(obj, Address(tmp3_t));
 343     addw(tmp2_top, tmp2_top, oopSize);
 344     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 345     j(locked);
 346   }
 347 
 348   { // Handle inflated monitor.
 349     bind(inflated);
 350 
 351     const Register tmp1_monitor = tmp1;
 352     if (!UseObjectMonitorTable) {
 353       assert(tmp1_monitor == tmp1_mark, "should be the same here");
 354     } else {
 355       Label monitor_found;
 356 
 357       // Load cache address
 358       la(tmp3_t, Address(xthread, JavaThread::om_cache_oops_offset()));
 359 
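      // Probe the first two entries of the per-thread OMCache with straight-line
      // code, then fall into a loop that scans until the terminating null entry.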
 360       const int num_unrolled = 2;
 361       for (int i = 0; i < num_unrolled; i++) {
 362         ld(tmp1, Address(tmp3_t));
 363         beq(obj, tmp1, monitor_found);
 364         add(tmp3_t, tmp3_t, in_bytes(OMCache::oop_to_oop_difference()));
 365       }
 366 
 367       Label loop;
 368 
 369       // Search for obj in cache.
 370       bind(loop);
 371 
 372       // Check for match.
 373       ld(tmp1, Address(tmp3_t));
 374       beq(obj, tmp1, monitor_found);
 375 
 376       // Search until null encountered, guaranteed _null_sentinel at end.
 377       add(tmp3_t, tmp3_t, in_bytes(OMCache::oop_to_oop_difference()));
 378       bnez(tmp1, loop);
 379       // Cache Miss. Take the slowpath.
 380       j(slow_path);
 381 
 382       bind(monitor_found);
 383       ld(tmp1_monitor, Address(tmp3_t, OMCache::oop_to_monitor_difference()));
 384     }
 385 
 386     const Register tmp2_owner_addr = tmp2;
 387     const Register tmp3_owner = tmp3;
 388 
 389     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 390     const Address owner_address(tmp1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 391     const Address recursions_address(tmp1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 392 
 393     Label monitor_locked;
 394 
 395     // Compute owner address.
 396     la(tmp2_owner_addr, owner_address);
 397 
 398     // CAS owner (null => current thread).
 399     cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ xthread, Assembler::int64,
 400             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
 401     beqz(tmp3_owner, monitor_locked);
 402 
 403     // Check if recursive.
 404     bne(tmp3_owner, xthread, slow_path);
 405 
 406     // Recursive.
 407     increment(recursions_address, 1, tmp2, tmp3);
 408 
 409     bind(monitor_locked);
 410     if (UseObjectMonitorTable) {
 411       sd(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 412     }
 413   }
 414 
 415   bind(locked);
 416   mv(flag, zr);
 417   increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);
 418 
 419 #ifdef ASSERT
 420   // Check that locked label is reached with flag == 0.
 421   Label flag_correct;
 422   beqz(flag, flag_correct);
 423   stop("Fast Lock Flag != 0");
 424 #endif
 425 
 426   bind(slow_path);
 427 #ifdef ASSERT
 428   // Check that slow_path label is reached with flag != 0.
 429   bnez(flag, flag_correct);
 430   stop("Fast Lock Flag == 0");
 431   bind(flag_correct);
 432 #endif
 433   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 434 }
 435 
 436 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box,
 437                                                 Register tmp1, Register tmp2, Register tmp3) {
 438   // Flag register, zero for success; non-zero for failure.
 439   Register flag = t1;
 440 
 441   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 442   assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0);
 443 
 444   mv(flag, 1);
 445 
 446   // Handle inflated monitor.
 447   Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. unlocked MUST be reached with flag == 0.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST be reached with flag != 0.
 451   Label slow_path;
 452 
 453   const Register tmp1_mark = tmp1;
 454   const Register tmp2_top = tmp2;
 455   const Register tmp3_t = tmp3;
 456 
 457   { // Lightweight unlock
 458     Label push_and_slow_path;
 459 
 460     // Check if obj is top of lock-stack.
 461     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 462     subw(tmp2_top, tmp2_top, oopSize);
 463     add(tmp3_t, xthread, tmp2_top);
 464     ld(tmp3_t, Address(tmp3_t));
 465     // Top of lock stack was not obj. Must be monitor.
 466     bne(obj, tmp3_t, inflated_load_mark);
 467 
 468     // Pop lock-stack.
 469     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 470     DEBUG_ONLY(sd(zr, Address(tmp3_t));)
 471     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 472 
 473     // Check if recursive.
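    // If the entry below the one just popped is also obj, this was a recursive
    // lightweight lock and popping the lock-stack is all that is needed.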
 474     add(tmp3_t, xthread, tmp2_top);
 475     ld(tmp3_t, Address(tmp3_t, -oopSize));
 476     beq(obj, tmp3_t, unlocked);
 477 
 478     // Not recursive.
 479     // Load Mark.
 480     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 481 
 482     // Check header for monitor (0b10).
    // Because we got here by popping (meaning obj was pushed when it was locked),
    // there will be no monitor in the box. So we need to push obj back so that
    // the runtime can fix any potential anonymous owner.
 486     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 487     bnez(tmp3_t, UseObjectMonitorTable ? push_and_slow_path : inflated);
 488 
 489     // Try to unlock. Transition lock bits 0b00 => 0b01
 490     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 491     ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
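    // tmp1_mark is the expected locked (lock bits 0b00) mark and tmp3_t the
    // unlocked (0b01) mark to install; if the CAS fails we fall through, restore
    // the lock-stack, and take the slow path.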
 492     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 493             /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
 494     beq(tmp1_mark, tmp3_t, unlocked);
 495 
 496     bind(push_and_slow_path);
 497     // Compare and exchange failed.
 498     // Restore lock-stack and handle the unlock in runtime.
 499     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 500     DEBUG_ONLY(sd(obj, Address(tmp3_t));)
 501     addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 503     j(slow_path);
 504   }
 505 
 506   { // Handle inflated monitor.
 507     bind(inflated_load_mark);
 508     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 509 #ifdef ASSERT
 510     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 511     bnez(tmp3_t, inflated);
 512     stop("Fast Unlock not monitor");
 513 #endif
 514 
 515     bind(inflated);
 516 
 517 #ifdef ASSERT
 518     Label check_done;
 519     subw(tmp2_top, tmp2_top, oopSize);
 520     mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
 521     blt(tmp2_top, tmp3_t, check_done);
 522     add(tmp3_t, xthread, tmp2_top);
 523     ld(tmp3_t, Address(tmp3_t));
 524     bne(obj, tmp3_t, inflated);
 525     stop("Fast Unlock lock on stack");
 526     bind(check_done);
 527 #endif
 528 
 529     const Register tmp1_monitor = tmp1;
 530 
 531     if (!UseObjectMonitorTable) {
 532       assert(tmp1_monitor == tmp1_mark, "should be the same here");
 533       // Untag the monitor.
 534       add(tmp1_monitor, tmp1_mark, -(int)markWord::monitor_value);
 535     } else {
 536       ld(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 537       // No valid pointer below alignof(ObjectMonitor*). Take the slow path.
 538       mv(tmp3_t, alignof(ObjectMonitor*));
 539       bltu(tmp1_monitor, tmp3_t, slow_path);
 540     }
 541 
 542     const Register tmp2_recursions = tmp2;
 543     Label not_recursive;
 544 
 545     // Check if recursive.
 546     ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 547     beqz(tmp2_recursions, not_recursive);
 548 
 549     // Recursive unlock.
 550     addi(tmp2_recursions, tmp2_recursions, -1);
 551     sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 552     j(unlocked);
 553 
 554     bind(not_recursive);
 555 
 556     const Register tmp2_owner_addr = tmp2;
 557 
 558     // Compute owner address.
 559     la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));
 560 
 561     // Set owner to null.
 562     // Release to satisfy the JMM
 563     membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 564     sd(zr, Address(tmp2_owner_addr));
 565     // We need a full fence after clearing owner to avoid stranding.
 566     // StoreLoad achieves this.
 567     membar(StoreLoad);
 568 
 569     // Check if the entry lists are empty (EntryList first - by convention).
 570     ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset()));
 571     ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset()));
 572     orr(t0, t0, tmp3_t);
 573     beqz(t0, unlocked); // If so we are done.
 574 
 575     // Check if there is a successor.
 576     ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::succ_offset()));
 577     bnez(tmp3_t, unlocked); // If so we are done.
 578 
 579     // Save the monitor pointer in the current thread, so we can try
 580     // to reacquire the lock in SharedRuntime::monitor_exit_helper().
 581     sd(tmp1_monitor, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));
 582 
 583     mv(flag, 1);
 584     j(slow_path);
 585   }
 586 
 587   bind(unlocked);
 588   mv(flag, zr);
 589   decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);
 590 
 591 #ifdef ASSERT
 592   // Check that unlocked label is reached with flag == 0.
 593   Label flag_correct;
 594   beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
 596 #endif
 597 
 598   bind(slow_path);
 599 #ifdef ASSERT
 600   // Check that slow_path label is reached with flag != 0.
 601   bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
 603   bind(flag_correct);
 604 #endif
 605   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 606 }
 607 
 608 // short string
 609 // StringUTF16.indexOfChar
 610 // StringLatin1.indexOfChar
 611 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
 612                                                   Register ch, Register result,
 613                                                   bool isL)
 614 {
 615   Register ch1 = t0;
 616   Register index = t1;
 617 
 618   BLOCK_COMMENT("string_indexof_char_short {");
 619 
 620   Label LOOP, LOOP1, LOOP4, LOOP8;
 621   Label MATCH,  MATCH1, MATCH2, MATCH3,
 622         MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
 623 
 624   mv(result, -1);
 625   mv(index, zr);
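  // Scan up to 8 characters per iteration while enough remain, then 4, then 1;
  // the MATCHn labels record which of the unrolled slots hit.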
 626 
 627   bind(LOOP);
 628   addi(t0, index, 8);
 629   ble(t0, cnt1, LOOP8);
 630   addi(t0, index, 4);
 631   ble(t0, cnt1, LOOP4);
 632   j(LOOP1);
 633 
 634   bind(LOOP8);
 635   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 636   beq(ch, ch1, MATCH);
 637   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 638   beq(ch, ch1, MATCH1);
 639   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 640   beq(ch, ch1, MATCH2);
 641   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 642   beq(ch, ch1, MATCH3);
 643   isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
 644   beq(ch, ch1, MATCH4);
 645   isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
 646   beq(ch, ch1, MATCH5);
 647   isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
 648   beq(ch, ch1, MATCH6);
 649   isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
 650   beq(ch, ch1, MATCH7);
 651   addi(index, index, 8);
 652   addi(str1, str1, isL ? 8 : 16);
 653   blt(index, cnt1, LOOP);
 654   j(NOMATCH);
 655 
 656   bind(LOOP4);
 657   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 658   beq(ch, ch1, MATCH);
 659   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 660   beq(ch, ch1, MATCH1);
 661   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 662   beq(ch, ch1, MATCH2);
 663   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 664   beq(ch, ch1, MATCH3);
 665   addi(index, index, 4);
 666   addi(str1, str1, isL ? 4 : 8);
 667   bge(index, cnt1, NOMATCH);
 668 
 669   bind(LOOP1);
 670   isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
 671   beq(ch, ch1, MATCH);
 672   addi(index, index, 1);
 673   addi(str1, str1, isL ? 1 : 2);
 674   blt(index, cnt1, LOOP1);
 675   j(NOMATCH);
 676 
 677   bind(MATCH1);
 678   addi(index, index, 1);
 679   j(MATCH);
 680 
 681   bind(MATCH2);
 682   addi(index, index, 2);
 683   j(MATCH);
 684 
 685   bind(MATCH3);
 686   addi(index, index, 3);
 687   j(MATCH);
 688 
 689   bind(MATCH4);
 690   addi(index, index, 4);
 691   j(MATCH);
 692 
 693   bind(MATCH5);
 694   addi(index, index, 5);
 695   j(MATCH);
 696 
 697   bind(MATCH6);
 698   addi(index, index, 6);
 699   j(MATCH);
 700 
 701   bind(MATCH7);
 702   addi(index, index, 7);
 703 
 704   bind(MATCH);
 705   mv(result, index);
 706   bind(NOMATCH);
 707   BLOCK_COMMENT("} string_indexof_char_short");
 708 }
 709 
 710 // StringUTF16.indexOfChar
 711 // StringLatin1.indexOfChar
 712 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 713                                             Register ch, Register result,
 714                                             Register tmp1, Register tmp2,
 715                                             Register tmp3, Register tmp4,
 716                                             bool isL)
 717 {
 718   Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
 719   Register ch1 = t0;
 720   Register orig_cnt = t1;
 721   Register mask1 = tmp3;
 722   Register mask2 = tmp2;
 723   Register match_mask = tmp1;
 724   Register trailing_char = tmp4;
 725   Register unaligned_elems = tmp4;
 726 
 727   BLOCK_COMMENT("string_indexof_char {");
 728   beqz(cnt1, NOMATCH);
 729 
 730   addi(t0, cnt1, isL ? -32 : -16);
 731   bgtz(t0, DO_LONG);
 732   string_indexof_char_short(str1, cnt1, ch, result, isL);
 733   j(DONE);
 734 
 735   bind(DO_LONG);
 736   mv(orig_cnt, cnt1);
 737   if (AvoidUnalignedAccesses) {
 738     Label ALIGNED;
 739     andi(unaligned_elems, str1, 0x7);
 740     beqz(unaligned_elems, ALIGNED);
 741     sub(unaligned_elems, unaligned_elems, 8);
 742     neg(unaligned_elems, unaligned_elems);
 743     if (!isL) {
 744       srli(unaligned_elems, unaligned_elems, 1);
 745     }
 746     // do unaligned part per element
 747     string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
 748     bgez(result, DONE);
 749     mv(orig_cnt, cnt1);
 750     sub(cnt1, cnt1, unaligned_elems);
 751     bind(ALIGNED);
 752   }
 753 
 754   // duplicate ch
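  // Broadcast ch into every byte (Latin1) or halfword (UTF-16) lane of a 64-bit
  // word so that 8 (or 4) characters can be tested per load below.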
 755   if (isL) {
 756     slli(ch1, ch, 8);
 757     orr(ch, ch1, ch);
 758   }
 759   slli(ch1, ch, 16);
 760   orr(ch, ch1, ch);
 761   slli(ch1, ch, 32);
 762   orr(ch, ch1, ch);
 763 
 764   if (!isL) {
 765     slli(cnt1, cnt1, 1);
 766   }
 767 
 768   uint64_t mask0101 = UCONST64(0x0101010101010101);
 769   uint64_t mask0001 = UCONST64(0x0001000100010001);
 770   mv(mask1, isL ? mask0101 : mask0001);
 771   uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
 772   uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
 773   mv(mask2, isL ? mask7f7f : mask7fff);
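  // mask1/mask2 enable a SWAR (SIMD-within-a-register) zero-lane test: for
  // x = ch1 ^ ch, (x - mask1) & ~(x | mask2) is non-zero iff some lane of x is
  // zero, and its lowest set bit marks the first character that equals ch.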
 774 
 775   bind(CH1_LOOP);
 776   ld(ch1, Address(str1));
 777   addi(str1, str1, 8);
 778   addi(cnt1, cnt1, -8);
 779   compute_match_mask(ch1, ch, match_mask, mask1, mask2);
 780   bnez(match_mask, HIT);
 781   bgtz(cnt1, CH1_LOOP);
 782   j(NOMATCH);
 783 
 784   bind(HIT);
 785   ctzc_bit(trailing_char, match_mask, isL, ch1, result);
 786   srli(trailing_char, trailing_char, 3);
 787   addi(cnt1, cnt1, 8);
 788   ble(cnt1, trailing_char, NOMATCH);
 789   // match case
 790   if (!isL) {
 791     srli(cnt1, cnt1, 1);
 792     srli(trailing_char, trailing_char, 1);
 793   }
 794 
 795   sub(result, orig_cnt, cnt1);
 796   add(result, result, trailing_char);
 797   j(DONE);
 798 
 799   bind(NOMATCH);
 800   mv(result, -1);
 801 
 802   bind(DONE);
 803   BLOCK_COMMENT("} string_indexof_char");
 804 }
 805 
 806 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
 807 
 808 // Search for needle in haystack and return index or -1
 809 // x10: result
 810 // x11: haystack
 811 // x12: haystack_len
 812 // x13: needle
 813 // x14: needle_len
 814 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
 815                                        Register haystack_len, Register needle_len,
 816                                        Register tmp1, Register tmp2,
 817                                        Register tmp3, Register tmp4,
 818                                        Register tmp5, Register tmp6,
 819                                        Register result, int ae)
 820 {
 821   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 822 
 823   Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
 824 
 825   Register ch1 = t0;
 826   Register ch2 = t1;
 827   Register nlen_tmp = tmp1; // needle len tmp
 828   Register hlen_tmp = tmp2; // haystack len tmp
 829   Register result_tmp = tmp4;
 830 
 831   bool isLL = ae == StrIntrinsicNode::LL;
 832 
 833   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 834   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 835   int needle_chr_shift = needle_isL ? 0 : 1;
 836   int haystack_chr_shift = haystack_isL ? 0 : 1;
 837   int needle_chr_size = needle_isL ? 1 : 2;
 838   int haystack_chr_size = haystack_isL ? 1 : 2;
 839   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 840                               (load_chr_insn)&MacroAssembler::lhu;
 841   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 842                                 (load_chr_insn)&MacroAssembler::lhu;
 843 
 844   BLOCK_COMMENT("string_indexof {");
 845 
 846   // Note, inline_string_indexOf() generates checks:
 847   // if (pattern.count > src.count) return -1;
 848   // if (pattern.count == 0) return 0;
 849 
 850   // We have two strings, a source string in haystack, haystack_len and a pattern string
 851   // in needle, needle_len. Find the first occurrence of pattern in source or return -1.
 852 
  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
 854   // With a small pattern and source we use linear scan.
 855 
 856   // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
 857   sub(result_tmp, haystack_len, needle_len);
 858   // needle_len < 8, use linear scan
 859   sub(t0, needle_len, 8);
 860   bltz(t0, LINEARSEARCH);
 861   // needle_len >= 256, use linear scan
 862   sub(t0, needle_len, 256);
 863   bgez(t0, LINEARSTUB);
 864   // needle_len >= haystack_len/4, use linear scan
 865   srli(t0, haystack_len, 2);
 866   bge(needle_len, t0, LINEARSTUB);
 867 
 868   // Boyer-Moore-Horspool introduction:
  // The Boyer-Moore algorithm is based on the description here:
 870   //
 871   // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 872   //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 874   // and the 'Good Suffix' rule.
 875   //
 876   // These rules are essentially heuristics for how far we can shift the
 877   // pattern along the search string.
 878   //
 879   // The implementation here uses the 'Bad Character' rule only because of the
 880   // complexity of initialisation for the 'Good Suffix' rule.
 881   //
 882   // This is also known as the Boyer-Moore-Horspool algorithm:
 883   //
 884   // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 885   //
 886   // #define ASIZE 256
 887   //
 888   //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
 889   //      int i, j;
 890   //      unsigned c;
 891   //      unsigned char bc[ASIZE];
 892   //
 893   //      /* Preprocessing */
 894   //      for (i = 0; i < ASIZE; ++i)
 895   //        bc[i] = m;
 896   //      for (i = 0; i < m - 1; ) {
 897   //        c = pattern[i];
 898   //        ++i;
 899   //        // c < 256 for Latin1 string, so, no need for branch
 900   //        #ifdef PATTERN_STRING_IS_LATIN1
 901   //        bc[c] = m - i;
 902   //        #else
 903   //        if (c < ASIZE) bc[c] = m - i;
 904   //        #endif
 905   //      }
 906   //
 907   //      /* Searching */
 908   //      j = 0;
 909   //      while (j <= n - m) {
 910   //        c = src[i+j];
 911   //        if (pattern[m-1] == c)
 912   //          int k;
 913   //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 914   //          if (k < 0) return j;
 915   //          // c < 256 for Latin1 string, so, no need for branch
 916   //          #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
 917   //          // LL case: (c< 256) always true. Remove branch
 918   //          j += bc[pattern[j+m-1]];
 919   //          #endif
 920   //          #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
 921   //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 922   //          if (c < ASIZE)
 923   //            j += bc[pattern[j+m-1]];
 924   //          else
 925   //            j += 1
 926   //          #endif
 927   //          #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
 928   //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 929   //          if (c < ASIZE)
 930   //            j += bc[pattern[j+m-1]];
 931   //          else
 932   //            j += m
 933   //          #endif
 934   //      }
 935   //      return -1;
 936   //    }
 937 
 938   // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
 939   Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 940         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 941 
 942   Register haystack_end = haystack_len;
 943   Register skipch = tmp2;
 944 
  // pattern length is >= 8, so we can read at least 1 register for the cases when
  // UTF->Latin1 conversion is not needed (8 chars for LL or 4 for UU) and half a
  // register for the UL case. We'll re-read the last character in the inner
  // pre-loop code to have a single outer pre-loop load.
 949   const int firstStep = isLL ? 7 : 3;
 950 
 951   const int ASIZE = 256;
  const int STORE_BYTES = 8; // 8 bytes stored per instruction (sd)
 953 
 954   sub(sp, sp, ASIZE);
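  // The 256-entry bad-character table lives directly on the stack; both exits
  // from the Boyer-Moore-Horspool code below restore sp before leaving.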
 955 
 956   // init BC offset table with default value: needle_len
 957   slli(t0, needle_len, 8);
 958   orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
 959   slli(tmp1, t0, 16);
 960   orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
 961   slli(tmp1, t0, 32);
 962   orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
 963 
 964   mv(ch1, sp);  // ch1 is t0
 965   mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
 966 
 967   bind(BM_INIT_LOOP);
 968   // for (i = 0; i < ASIZE; ++i)
 969   //   bc[i] = m;
 970   for (int i = 0; i < 4; i++) {
 971     sd(tmp5, Address(ch1, i * wordSize));
 972   }
 973   add(ch1, ch1, 32);
 974   sub(tmp6, tmp6, 4);
 975   bgtz(tmp6, BM_INIT_LOOP);
 976 
 977   sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
 978   Register orig_haystack = tmp5;
 979   mv(orig_haystack, haystack);
 980   // result_tmp = tmp4
 981   shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
 982   sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
 983   mv(tmp3, needle);
 984 
 985   //  for (i = 0; i < m - 1; ) {
 986   //    c = pattern[i];
 987   //    ++i;
 988   //    // c < 256 for Latin1 string, so, no need for branch
 989   //    #ifdef PATTERN_STRING_IS_LATIN1
 990   //    bc[c] = m - i;
 991   //    #else
 992   //    if (c < ASIZE) bc[c] = m - i;
 993   //    #endif
 994   //  }
 995   bind(BCLOOP);
 996   (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
 997   add(tmp3, tmp3, needle_chr_size);
 998   if (!needle_isL) {
 999     // ae == StrIntrinsicNode::UU
1000     mv(tmp6, ASIZE);
1001     bgeu(ch1, tmp6, BCSKIP);
1002   }
1003   add(tmp4, sp, ch1);
1004   sb(ch2, Address(tmp4)); // store skip offset to BC offset table
1005 
1006   bind(BCSKIP);
1007   sub(ch2, ch2, 1); // for next pattern element, skip distance -1
1008   bgtz(ch2, BCLOOP);
1009 
1010   // tmp6: pattern end, address after needle
1011   shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
1012   if (needle_isL == haystack_isL) {
1013     // load last 8 bytes (8LL/4UU symbols)
1014     ld(tmp6, Address(tmp6, -wordSize));
1015   } else {
1016     // UL: from UTF-16(source) search Latin1(pattern)
1017     lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
1018     // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
1019     // We'll have to wait until load completed, but it's still faster than per-character loads+checks
1020     srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
1021     slli(ch2, tmp6, XLEN - 24);
1022     srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
1023     slli(ch1, tmp6, XLEN - 16);
1024     srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
1025     andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
1026     slli(ch2, ch2, 16);
1027     orr(ch2, ch2, ch1); // 0x00000b0c
1028     slli(result, tmp3, 48); // use result as temp register
1029     orr(tmp6, tmp6, result); // 0x0a00000d
1030     slli(result, ch2, 16);
1031     orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
1032   }
1033 
1034   // i = m - 1;
1035   // skipch = j + i;
1036   // if (skipch == pattern[m - 1]
1037   //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
1038   // else
1039   //   move j with bad char offset table
1040   bind(BMLOOPSTR2);
1041   // compare pattern to source string backward
1042   shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
1043   (this->*haystack_load_1chr)(skipch, Address(result), noreg);
1044   sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
1045   if (needle_isL == haystack_isL) {
1046     // re-init tmp3. It's for free because it's executed in parallel with
1047     // load above. Alternative is to initialize it before loop, but it'll
1048     // affect performance on in-order systems with 2 or more ld/st pipelines
1049     srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
1050   }
1051   if (!isLL) { // UU/UL case
1052     slli(ch2, nlen_tmp, 1); // offsets in bytes
1053   }
1054   bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
1055   add(result, haystack, isLL ? nlen_tmp : ch2);
1056   // load 8 bytes from source string
1057   // if isLL is false then read granularity can be 2
1058   load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
1059   mv(ch1, tmp6);
1060   if (isLL) {
1061     j(BMLOOPSTR1_AFTER_LOAD);
1062   } else {
1063     sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
1064     j(BMLOOPSTR1_CMP);
1065   }
1066 
1067   bind(BMLOOPSTR1);
1068   shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
1069   (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1070   shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
1071   (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1072 
1073   bind(BMLOOPSTR1_AFTER_LOAD);
1074   sub(nlen_tmp, nlen_tmp, 1);
1075   bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
1076 
1077   bind(BMLOOPSTR1_CMP);
1078   beq(ch1, ch2, BMLOOPSTR1);
1079 
1080   bind(BMSKIP);
1081   if (!isLL) {
1082     // if we've met UTF symbol while searching Latin1 pattern, then we can
1083     // skip needle_len symbols
1084     if (needle_isL != haystack_isL) {
1085       mv(result_tmp, needle_len);
1086     } else {
1087       mv(result_tmp, 1);
1088     }
1089     mv(t0, ASIZE);
1090     bgeu(skipch, t0, BMADV);
1091   }
1092   add(result_tmp, sp, skipch);
1093   lbu(result_tmp, Address(result_tmp)); // load skip offset
1094 
1095   bind(BMADV);
1096   sub(nlen_tmp, needle_len, 1);
1097   // move haystack after bad char skip offset
1098   shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
1099   ble(haystack, haystack_end, BMLOOPSTR2);
1100   add(sp, sp, ASIZE);
1101   j(NOMATCH);
1102 
1103   bind(BMLOOPSTR1_LASTCMP);
1104   bne(ch1, ch2, BMSKIP);
1105 
1106   bind(BMMATCH);
1107   sub(result, haystack, orig_haystack);
1108   if (!haystack_isL) {
1109     srli(result, result, 1);
1110   }
1111   add(sp, sp, ASIZE);
1112   j(DONE);
1113 
1114   bind(LINEARSTUB);
  sub(t0, needle_len, 16); // small patterns should still be handled by the simple algorithm
1116   bltz(t0, LINEARSEARCH);
1117   mv(result, zr);
1118   RuntimeAddress stub = nullptr;
1119   if (isLL) {
1120     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
1121     assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
1122   } else if (needle_isL) {
1123     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
1124     assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
1125   } else {
1126     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
1127     assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
1128   }
1129   address call = reloc_call(stub);
1130   if (call == nullptr) {
1131     DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
1132     ciEnv::current()->record_failure("CodeCache is full");
1133     return;
1134   }
1135   j(DONE);
1136 
1137   bind(NOMATCH);
1138   mv(result, -1);
1139   j(DONE);
1140 
1141   bind(LINEARSEARCH);
1142   string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
1143 
1144   bind(DONE);
1145   BLOCK_COMMENT("} string_indexof");
1146 }
1147 
1148 // string_indexof
1149 // result: x10
1150 // src: x11
1151 // src_count: x12
1152 // pattern: x13
1153 // pattern_count: x14 or 1/2/3/4
1154 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
1155                                                Register haystack_len, Register needle_len,
1156                                                Register tmp1, Register tmp2,
1157                                                Register tmp3, Register tmp4,
1158                                                int needle_con_cnt, Register result, int ae)
1159 {
1160   // Note:
1161   // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
1162   // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
1163   assert(needle_con_cnt <= 4, "Invalid needle constant count");
1164   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1165 
1166   Register ch1 = t0;
1167   Register ch2 = t1;
1168   Register hlen_neg = haystack_len, nlen_neg = needle_len;
1169   Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
1170 
1171   bool isLL = ae == StrIntrinsicNode::LL;
1172 
1173   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
1174   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
1175   int needle_chr_shift = needle_isL ? 0 : 1;
1176   int haystack_chr_shift = haystack_isL ? 0 : 1;
1177   int needle_chr_size = needle_isL ? 1 : 2;
1178   int haystack_chr_size = haystack_isL ? 1 : 2;
1179 
1180   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
1181                               (load_chr_insn)&MacroAssembler::lhu;
1182   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
1183                                 (load_chr_insn)&MacroAssembler::lhu;
1184   load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
1185   load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
1186 
1187   Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
1188 
1189   Register first = tmp3;
1190 
1191   if (needle_con_cnt == -1) {
1192     Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
1193 
1194     sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
1195     bltz(t0, DOSHORT);
1196 
1197     (this->*needle_load_1chr)(first, Address(needle), noreg);
1198     slli(t0, needle_len, needle_chr_shift);
1199     add(needle, needle, t0);
1200     neg(nlen_neg, t0);
1201     slli(t0, result_tmp, haystack_chr_shift);
1202     add(haystack, haystack, t0);
1203     neg(hlen_neg, t0);
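    // needle and haystack now point just past their last candidate positions and
    // are indexed with negative offsets that rise toward zero, so a single
    // sign check (blez/bgez) terminates each loop.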
1204 
1205     bind(FIRST_LOOP);
1206     add(t0, haystack, hlen_neg);
1207     (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
1208     beq(first, ch2, STR1_LOOP);
1209 
1210     bind(STR2_NEXT);
1211     add(hlen_neg, hlen_neg, haystack_chr_size);
1212     blez(hlen_neg, FIRST_LOOP);
1213     j(NOMATCH);
1214 
1215     bind(STR1_LOOP);
1216     add(nlen_tmp, nlen_neg, needle_chr_size);
1217     add(hlen_tmp, hlen_neg, haystack_chr_size);
1218     bgez(nlen_tmp, MATCH);
1219 
1220     bind(STR1_NEXT);
1221     add(ch1, needle, nlen_tmp);
1222     (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1223     add(ch2, haystack, hlen_tmp);
1224     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1225     bne(ch1, ch2, STR2_NEXT);
1226     add(nlen_tmp, nlen_tmp, needle_chr_size);
1227     add(hlen_tmp, hlen_tmp, haystack_chr_size);
1228     bltz(nlen_tmp, STR1_NEXT);
1229     j(MATCH);
1230 
1231     bind(DOSHORT);
1232     if (needle_isL == haystack_isL) {
1233       sub(t0, needle_len, 2);
1234       bltz(t0, DO1);
1235       bgtz(t0, DO3);
1236     }
1237   }
1238 
1239   if (needle_con_cnt == 4) {
1240     Label CH1_LOOP;
1241     (this->*load_4chr)(ch1, Address(needle), noreg);
1242     sub(result_tmp, haystack_len, 4);
1243     slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
1244     add(haystack, haystack, tmp3);
1245     neg(hlen_neg, tmp3);
1246     if (AvoidUnalignedAccesses) {
      // preload the first value, then read 1 character per iteration instead of four,
      // just shifting the previous ch2 right by the character size in bits
1249       add(tmp3, haystack, hlen_neg);
1250       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1251       if (isLL) {
1252         // need to erase 1 most significant byte in 32-bit value of ch2
1253         slli(ch2, ch2, 40);
1254         srli(ch2, ch2, 32);
1255       } else {
1256         slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
1257       }
1258     }
1259 
1260     bind(CH1_LOOP);
1261     add(tmp3, haystack, hlen_neg);
1262     if (AvoidUnalignedAccesses) {
1263       srli(ch2, ch2, isLL ? 8 : 16);
1264       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
1265       slli(tmp3, tmp3, isLL ? 24 : 48);
1266       add(ch2, ch2, tmp3);
1267     } else {
1268       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1269     }
1270     beq(ch1, ch2, MATCH);
1271     add(hlen_neg, hlen_neg, haystack_chr_size);
1272     blez(hlen_neg, CH1_LOOP);
1273     j(NOMATCH);
1274   }
1275 
1276   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
1277     Label CH1_LOOP;
1278     BLOCK_COMMENT("string_indexof DO2 {");
1279     bind(DO2);
1280     (this->*load_2chr)(ch1, Address(needle), noreg);
1281     if (needle_con_cnt == 2) {
1282       sub(result_tmp, haystack_len, 2);
1283     }
1284     slli(tmp3, result_tmp, haystack_chr_shift);
1285     add(haystack, haystack, tmp3);
1286     neg(hlen_neg, tmp3);
1287     if (AvoidUnalignedAccesses) {
      // preload the first value, then read 1 character per iteration instead of two,
      // just shifting the previous ch2 right by the character size in bits
1290       add(tmp3, haystack, hlen_neg);
1291       (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1292       slli(ch2, ch2, isLL ? 8 : 16);
1293     }
1294     bind(CH1_LOOP);
1295     add(tmp3, haystack, hlen_neg);
1296     if (AvoidUnalignedAccesses) {
1297       srli(ch2, ch2, isLL ? 8 : 16);
1298       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
1299       slli(tmp3, tmp3, isLL ? 8 : 16);
1300       add(ch2, ch2, tmp3);
1301     } else {
1302       (this->*load_2chr)(ch2, Address(tmp3), noreg);
1303     }
1304     beq(ch1, ch2, MATCH);
1305     add(hlen_neg, hlen_neg, haystack_chr_size);
1306     blez(hlen_neg, CH1_LOOP);
1307     j(NOMATCH);
1308     BLOCK_COMMENT("} string_indexof DO2");
1309   }
1310 
1311   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
1312     Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1313     BLOCK_COMMENT("string_indexof DO3 {");
1314 
1315     bind(DO3);
1316     (this->*load_2chr)(first, Address(needle), noreg);
1317     (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
1318     if (needle_con_cnt == 3) {
1319       sub(result_tmp, haystack_len, 3);
1320     }
1321     slli(hlen_tmp, result_tmp, haystack_chr_shift);
1322     add(haystack, haystack, hlen_tmp);
1323     neg(hlen_neg, hlen_tmp);
1324 
1325     bind(FIRST_LOOP);
1326     add(ch2, haystack, hlen_neg);
1327     if (AvoidUnalignedAccesses) {
1328       (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
1329       (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1330       slli(tmp2, tmp2, isLL ? 8 : 16);
1331       add(ch2, ch2, tmp2);
1332     } else {
1333       (this->*load_2chr)(ch2, Address(ch2), noreg);
1334     }
1335     beq(first, ch2, STR1_LOOP);
1336 
1337     bind(STR2_NEXT);
1338     add(hlen_neg, hlen_neg, haystack_chr_size);
1339     blez(hlen_neg, FIRST_LOOP);
1340     j(NOMATCH);
1341 
1342     bind(STR1_LOOP);
1343     add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
1344     add(ch2, haystack, hlen_tmp);
1345     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1346     bne(ch1, ch2, STR2_NEXT);
1347     j(MATCH);
1348     BLOCK_COMMENT("} string_indexof DO3");
1349   }
1350 
1351   if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1352     Label DO1_LOOP;
1353 
1354     BLOCK_COMMENT("string_indexof DO1 {");
1355     bind(DO1);
1356     (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1357     sub(result_tmp, haystack_len, 1);
1358     slli(tmp3, result_tmp, haystack_chr_shift);
1359     add(haystack, haystack, tmp3);
1360     neg(hlen_neg, tmp3);
1361 
1362     bind(DO1_LOOP);
1363     add(tmp3, haystack, hlen_neg);
1364     (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1365     beq(ch1, ch2, MATCH);
1366     add(hlen_neg, hlen_neg, haystack_chr_size);
1367     blez(hlen_neg, DO1_LOOP);
1368     BLOCK_COMMENT("} string_indexof DO1");
1369   }
1370 
1371   bind(NOMATCH);
1372   mv(result, -1);
1373   j(DONE);
1374 
1375   bind(MATCH);
1376   srai(t0, hlen_neg, haystack_chr_shift);
1377   add(result, result_tmp, t0);
1378 
1379   bind(DONE);
1380 }
1381 
1382 // Compare strings.
1383 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1384                                        Register cnt1, Register cnt2, Register result,
1385                                        Register tmp1, Register tmp2, Register tmp3,
1386                                        int ae)
1387 {
1388   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1389         DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1390         SHORT_LOOP_START, TAIL_CHECK, L;
1391 
1392   const int STUB_THRESHOLD = 64 + 8;
1393   bool isLL = ae == StrIntrinsicNode::LL;
1394   bool isLU = ae == StrIntrinsicNode::LU;
1395   bool isUL = ae == StrIntrinsicNode::UL;
1396 
1397   bool str1_isL = isLL || isLU;
1398   bool str2_isL = isLL || isUL;
1399 
1400   // for L strings, 1 byte for 1 character
1401   // for U strings, 2 bytes for 1 character
1402   int str1_chr_size = str1_isL ? 1 : 2;
1403   int str2_chr_size = str2_isL ? 1 : 2;
1404   int minCharsInWord = isLL ? wordSize : wordSize / 2;
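  // minCharsInWord is how many characters one 64-bit comparison covers:
  // 8 for LL, otherwise 4.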
1405 
1406   load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1407   load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1408 
1409   BLOCK_COMMENT("string_compare {");
1410 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
1413   if (!str1_isL) {
1414     sraiw(cnt1, cnt1, 1);
1415   }
1416   if (!str2_isL) {
1417     sraiw(cnt2, cnt2, 1);
1418   }
1419 
1420   // Compute the minimum of the string lengths and save the difference in result.
1421   sub(result, cnt1, cnt2);
1422   bgt(cnt1, cnt2, L);
1423   mv(cnt2, cnt1);
1424   bind(L);
1425 
1426   // A very short string
1427   mv(t0, minCharsInWord);
1428   ble(cnt2, t0, SHORT_STRING);
1429 
1430   // Compare longwords
1431   // load first parts of strings and finish initialization while loading
1432   {
1433     if (str1_isL == str2_isL) { // LL or UU
      // check if str1 and str2 are the same pointer
1435       beq(str1, str2, DONE);
1436       // load 8 bytes once to compare
1437       ld(tmp1, Address(str1));
1438       ld(tmp2, Address(str2));
1439       mv(t0, STUB_THRESHOLD);
1440       bge(cnt2, t0, STUB);
1441       sub(cnt2, cnt2, minCharsInWord);
1442       beqz(cnt2, TAIL_CHECK);
1443       // convert cnt2 from characters to bytes
1444       if (!str1_isL) {
1445         slli(cnt2, cnt2, 1);
1446       }
1447       add(str2, str2, cnt2);
1448       add(str1, str1, cnt2);
1449       sub(cnt2, zr, cnt2);
1450     } else if (isLU) { // LU case
1451       lwu(tmp1, Address(str1));
1452       ld(tmp2, Address(str2));
1453       mv(t0, STUB_THRESHOLD);
1454       bge(cnt2, t0, STUB);
1455       addi(cnt2, cnt2, -4);
1456       add(str1, str1, cnt2);
1457       sub(cnt1, zr, cnt2);
1458       slli(cnt2, cnt2, 1);
1459       add(str2, str2, cnt2);
1460       inflate_lo32(tmp3, tmp1);
1461       mv(tmp1, tmp3);
1462       sub(cnt2, zr, cnt2);
1463       addi(cnt1, cnt1, 4);
1464     } else { // UL case
1465       ld(tmp1, Address(str1));
1466       lwu(tmp2, Address(str2));
1467       mv(t0, STUB_THRESHOLD);
1468       bge(cnt2, t0, STUB);
1469       addi(cnt2, cnt2, -4);
1470       slli(t0, cnt2, 1);
1471       sub(cnt1, zr, t0);
1472       add(str1, str1, t0);
1473       add(str2, str2, cnt2);
1474       inflate_lo32(tmp3, tmp2);
1475       mv(tmp2, tmp3);
1476       sub(cnt2, zr, cnt2);
1477       addi(cnt1, cnt1, 8);
1478     }
1479     addi(cnt2, cnt2, isUL ? 4 : 8);
1480     bne(tmp1, tmp2, DIFFERENCE);
1481     bgez(cnt2, TAIL);
1482 
1483     // main loop
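    // The loop indexes are negative byte offsets from the (adjusted) string ends and are
    // incremented towards zero, so bltz/bgez can serve directly as the loop and tail tests.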
1484     bind(NEXT_WORD);
1485     if (str1_isL == str2_isL) { // LL or UU
1486       add(t0, str1, cnt2);
1487       ld(tmp1, Address(t0));
1488       add(t0, str2, cnt2);
1489       ld(tmp2, Address(t0));
1490       addi(cnt2, cnt2, 8);
1491     } else if (isLU) { // LU case
1492       add(t0, str1, cnt1);
1493       lwu(tmp1, Address(t0));
1494       add(t0, str2, cnt2);
1495       ld(tmp2, Address(t0));
1496       addi(cnt1, cnt1, 4);
1497       inflate_lo32(tmp3, tmp1);
1498       mv(tmp1, tmp3);
1499       addi(cnt2, cnt2, 8);
1500     } else { // UL case
1501       add(t0, str2, cnt2);
1502       lwu(tmp2, Address(t0));
1503       add(t0, str1, cnt1);
1504       ld(tmp1, Address(t0));
1505       inflate_lo32(tmp3, tmp2);
1506       mv(tmp2, tmp3);
1507       addi(cnt1, cnt1, 8);
1508       addi(cnt2, cnt2, 4);
1509     }
1510     bne(tmp1, tmp2, DIFFERENCE);
1511     bltz(cnt2, NEXT_WORD);
1512     bind(TAIL);
1513     if (str1_isL == str2_isL) { // LL or UU
1514       load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1515       load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1516     } else if (isLU) { // LU case
1517       load_int_misaligned(tmp1, Address(str1), tmp3, false);
1518       load_long_misaligned(tmp2, Address(str2), tmp3, 2);
1519       inflate_lo32(tmp3, tmp1);
1520       mv(tmp1, tmp3);
1521     } else { // UL case
1522       load_int_misaligned(tmp2, Address(str2), tmp3, false);
1523       load_long_misaligned(tmp1, Address(str1), tmp3, 2);
1524       inflate_lo32(tmp3, tmp2);
1525       mv(tmp2, tmp3);
1526     }
1527     bind(TAIL_CHECK);
1528     beq(tmp1, tmp2, DONE);
1529 
1530     // Find the first different characters in the longwords and
1531     // compute their difference.
1532     bind(DIFFERENCE);
1533     xorr(tmp3, tmp1, tmp2);
1534     ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
1535     srl(tmp1, tmp1, result);
1536     srl(tmp2, tmp2, result);
1537     if (isLL) {
1538       andi(tmp1, tmp1, 0xFF);
1539       andi(tmp2, tmp2, 0xFF);
1540     } else {
1541       andi(tmp1, tmp1, 0xFFFF);
1542       andi(tmp2, tmp2, 0xFFFF);
1543     }
1544     sub(result, tmp1, tmp2);
1545     j(DONE);
1546   }
1547 
1548   bind(STUB);
1549   RuntimeAddress stub = nullptr;
1550   switch (ae) {
1551     case StrIntrinsicNode::LL:
1552       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1553       break;
1554     case StrIntrinsicNode::UU:
1555       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1556       break;
1557     case StrIntrinsicNode::LU:
1558       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1559       break;
1560     case StrIntrinsicNode::UL:
1561       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1562       break;
1563     default:
1564       ShouldNotReachHere();
1565   }
1566   assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1567   address call = reloc_call(stub);
1568   if (call == nullptr) {
1569     DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1570     ciEnv::current()->record_failure("CodeCache is full");
1571     return;
1572   }
1573   j(DONE);
1574 
1575   bind(SHORT_STRING);
1576   // Is the minimum length zero?
1577   beqz(cnt2, DONE);
  // Arrange the code to do most branches while loading, and to load the next
  // characters while comparing the previous ones.
1580   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1581   addi(str1, str1, str1_chr_size);
1582   addi(cnt2, cnt2, -1);
1583   beqz(cnt2, SHORT_LAST_INIT);
1584   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1585   addi(str2, str2, str2_chr_size);
1586   j(SHORT_LOOP_START);
1587   bind(SHORT_LOOP);
1588   addi(cnt2, cnt2, -1);
1589   beqz(cnt2, SHORT_LAST);
1590   bind(SHORT_LOOP_START);
1591   (this->*str1_load_chr)(tmp2, Address(str1), t0);
1592   addi(str1, str1, str1_chr_size);
1593   (this->*str2_load_chr)(t0, Address(str2), t0);
1594   addi(str2, str2, str2_chr_size);
1595   bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1596   addi(cnt2, cnt2, -1);
1597   beqz(cnt2, SHORT_LAST2);
1598   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1599   addi(str1, str1, str1_chr_size);
1600   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1601   addi(str2, str2, str2_chr_size);
1602   beq(tmp2, t0, SHORT_LOOP);
1603   sub(result, tmp2, t0);
1604   j(DONE);
1605   bind(SHORT_LOOP_TAIL);
1606   sub(result, tmp1, cnt1);
1607   j(DONE);
1608   bind(SHORT_LAST2);
1609   beq(tmp2, t0, DONE);
1610   sub(result, tmp2, t0);
1611 
1612   j(DONE);
1613   bind(SHORT_LAST_INIT);
1614   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1615   addi(str2, str2, str2_chr_size);
1616   bind(SHORT_LAST);
1617   beq(tmp1, cnt1, DONE);
1618   sub(result, tmp1, cnt1);
1619 
1620   bind(DONE);
1621 
1622   BLOCK_COMMENT("} string_compare");
1623 }
1624 
1625 void C2_MacroAssembler::arrays_equals(Register a1, Register a2,
1626                                       Register tmp1, Register tmp2, Register tmp3,
1627                                       Register result, int elem_size) {
1628   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1629   assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0);
1630 
1631   int elem_per_word = wordSize/elem_size;
1632   int log_elem_size = exact_log2(elem_size);
1633   int length_offset = arrayOopDesc::length_offset_in_bytes();
1634   int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1635 
1636   Register cnt1 = tmp3;
  Register cnt2 = tmp1;  // cnt2 is only used for the array length compare
1638   Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01;
1639 
1640   BLOCK_COMMENT("arrays_equals {");
1641 
1642   // if (a1 == a2), return true
1643   beq(a1, a2, SAME);
1644 
1645   mv(result, false);
1646   // if (a1 == nullptr || a2 == nullptr)
1647   //     return false;
1648   beqz(a1, DONE);
1649   beqz(a2, DONE);
1650 
1651   // if (a1.length != a2.length)
1652   //      return false;
1653   lwu(cnt1, Address(a1, length_offset));
1654   lwu(cnt2, Address(a2, length_offset));
1655   bne(cnt1, cnt2, DONE);
1656 
1657   la(a1, Address(a1, base_offset));
1658   la(a2, Address(a2, base_offset));
  // Check for short arrays, i.e. with fewer than elem_per_word elements (wordSize bytes).
1660   addi(cnt1, cnt1, -elem_per_word);
1661   bltz(cnt1, SHORT);
1662 
1663   // Main 8 byte comparison loop.
1664   bind(NEXT_WORD); {
1665     ld(tmp1, Address(a1));
1666     ld(tmp2, Address(a2));
1667     addi(cnt1, cnt1, -elem_per_word);
1668     addi(a1, a1, wordSize);
1669     addi(a2, a2, wordSize);
1670     bne(tmp1, tmp2, DONE);
1671   } bgez(cnt1, NEXT_WORD);
1672 
1673   addi(tmp1, cnt1, elem_per_word);
1674   beqz(tmp1, SAME);
1675 
1676   bind(SHORT);
1677   test_bit(tmp1, cnt1, 2 - log_elem_size);
1678   beqz(tmp1, TAIL03); // 0-7 bytes left.
1679   {
1680     lwu(tmp1, Address(a1));
1681     lwu(tmp2, Address(a2));
1682     addi(a1, a1, 4);
1683     addi(a2, a2, 4);
1684     bne(tmp1, tmp2, DONE);
1685   }
1686 
1687   bind(TAIL03);
1688   test_bit(tmp1, cnt1, 1 - log_elem_size);
1689   beqz(tmp1, TAIL01); // 0-3 bytes left.
1690   {
1691     lhu(tmp1, Address(a1));
1692     lhu(tmp2, Address(a2));
1693     addi(a1, a1, 2);
1694     addi(a2, a2, 2);
1695     bne(tmp1, tmp2, DONE);
1696   }
1697 
1698   bind(TAIL01);
1699   if (elem_size == 1) { // Only needed when comparing byte arrays.
1700     test_bit(tmp1, cnt1, 0);
1701     beqz(tmp1, SAME); // 0-1 bytes left.
1702     {
1703       lbu(tmp1, Address(a1));
1704       lbu(tmp2, Address(a2));
1705       bne(tmp1, tmp2, DONE);
1706     }
1707   }
1708 
1709   bind(SAME);
1710   mv(result, true);
1711   // That's it.
1712   bind(DONE);
1713 
1714   BLOCK_COMMENT("} arrays_equals");
1715 }
1716 
1717 // Compare Strings
1718 
1719 // For Strings we're passed the address of the first characters in a1 and a2
1720 // and the length in cnt1. There are two implementations.
// For arrays >= 8 bytes, all comparisons (except for the tail) are performed
// 8 bytes at a time. For the tail, we compare a word, then a halfword, and then a byte.
// For strings < 8 bytes, we compare a word, then a halfword, and then a byte.
1724 
1725 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1726                                       Register result, Register cnt1)
1727 {
1728   Label SAME, DONE, SHORT, NEXT_WORD;
1729   Register tmp1 = t0;
1730   Register tmp2 = t1;
1731 
1732   assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1733 
1734   BLOCK_COMMENT("string_equals {");
1735 
1736   mv(result, false);
1737 
1738   // Check for short strings, i.e. smaller than wordSize.
1739   addi(cnt1, cnt1, -wordSize);
1740   bltz(cnt1, SHORT);
1741 
1742   // Main 8 byte comparison loop.
1743   bind(NEXT_WORD); {
1744     ld(tmp1, Address(a1));
1745     ld(tmp2, Address(a2));
1746     addi(cnt1, cnt1, -wordSize);
1747     addi(a1, a1, wordSize);
1748     addi(a2, a2, wordSize);
1749     bne(tmp1, tmp2, DONE);
1750   } bgez(cnt1, NEXT_WORD);
1751 
1752   addi(tmp1, cnt1, wordSize);
1753   beqz(tmp1, SAME);
1754 
1755   bind(SHORT);
1756   Label TAIL03, TAIL01;
1757 
1758   // 0-7 bytes left.
1759   test_bit(tmp1, cnt1, 2);
1760   beqz(tmp1, TAIL03);
1761   {
1762     lwu(tmp1, Address(a1));
1763     lwu(tmp2, Address(a2));
1764     addi(a1, a1, 4);
1765     addi(a2, a2, 4);
1766     bne(tmp1, tmp2, DONE);
1767   }
1768 
1769   bind(TAIL03);
1770   // 0-3 bytes left.
1771   test_bit(tmp1, cnt1, 1);
1772   beqz(tmp1, TAIL01);
1773   {
1774     lhu(tmp1, Address(a1));
1775     lhu(tmp2, Address(a2));
1776     addi(a1, a1, 2);
1777     addi(a2, a2, 2);
1778     bne(tmp1, tmp2, DONE);
1779   }
1780 
1781   bind(TAIL01);
1782   // 0-1 bytes left.
1783   test_bit(tmp1, cnt1, 0);
1784   beqz(tmp1, SAME);
1785   {
1786     lbu(tmp1, Address(a1));
1787     lbu(tmp2, Address(a2));
1788     bne(tmp1, tmp2, DONE);
1789   }
1790 
1791   // Arrays are equal.
1792   bind(SAME);
1793   mv(result, true);
1794 
1795   // That's it.
1796   bind(DONE);
1797   BLOCK_COMMENT("} string_equals");
1798 }
1799 
1800 // jdk.internal.util.ArraysSupport.vectorizedHashCode
1801 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
1802                                         Register tmp1, Register tmp2, Register tmp3,
1803                                         Register tmp4, Register tmp5, Register tmp6,
1804                                         BasicType eltype)
1805 {
1806   assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);
1807 
1808   const int elsize = arrays_hashcode_elsize(eltype);
1809   const int chunks_end_shift = exact_log2(elsize);
1810 
1811   switch (eltype) {
1812   case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
1813   case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
1814   case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
1815   case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
1816   case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
1817   default:
1818     ShouldNotReachHere();
1819   }
1820 
1821   const int stride = 4;
1822   const Register pow31_4 = tmp1;
1823   const Register pow31_3 = tmp2;
1824   const Register pow31_2 = tmp3;
1825   const Register chunks  = tmp4;
1826   const Register chunks_end = chunks;
1827 
1828   Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;
1829 
  // result already contains the initial hash value on entry
1831 
1832   beqz(cnt, DONE);
1833 
1834   andi(chunks, cnt, ~(stride-1));
1835   beqz(chunks, TAIL);
1836 
1837   mv(pow31_4, 923521);           // [31^^4]
1838   mv(pow31_3,  29791);           // [31^^3]
1839   mv(pow31_2,    961);           // [31^^2]
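  // Unrolled polynomial hash: each iteration of WIDE_LOOP computes
  //   h = 31^4*h + 31^3*ary[i+0] + 31^2*ary[i+1] + 31*ary[i+2] + ary[i+3]
  // which is equivalent to four steps of h = 31*h + ary[i].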
1840 
1841   slli(chunks_end, chunks, chunks_end_shift);
1842   add(chunks_end, ary, chunks_end);
1843   andi(cnt, cnt, stride-1);      // don't forget about tail!
1844 
1845   bind(WIDE_LOOP);
1846   mulw(result, result, pow31_4); // 31^^4 * h
1847   arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
1848   arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
1849   arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
1850   arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
1851   mulw(t0, t0, pow31_3);         // 31^^3 * ary[i+0]
1852   addw(result, result, t0);
1853   mulw(t1, t1, pow31_2);         // 31^^2 * ary[i+1]
1854   addw(result, result, t1);
1855   slli(t0, tmp5, 5);             // optimize 31^^1 * ary[i+2]
1856   subw(tmp5, t0, tmp5);          // with ary[i+2]<<5 - ary[i+2]
1857   addw(result, result, tmp5);
1858   addw(result, result, tmp6);    // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
1859                                  //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
1860   addi(ary, ary, elsize * stride);
1861   bne(ary, chunks_end, WIDE_LOOP);
1862   beqz(cnt, DONE);
1863 
1864   bind(TAIL);
1865   slli(chunks_end, cnt, chunks_end_shift);
1866   add(chunks_end, ary, chunks_end);
1867 
1868   bind(TAIL_LOOP);
1869   arrays_hashcode_elload(t0, Address(ary), eltype);
1870   slli(t1, result, 5);           // optimize 31 * result
1871   subw(result, t1, result);      // with result<<5 - result
1872   addw(result, result, t0);
1873   addi(ary, ary, elsize);
1874   bne(ary, chunks_end, TAIL_LOOP);
1875 
1876   bind(DONE);
1877   BLOCK_COMMENT("} // arrays_hashcode");
1878 }
1879 
1880 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
1881   switch (eltype) {
1882   case T_BOOLEAN: return sizeof(jboolean);
1883   case T_BYTE:    return sizeof(jbyte);
1884   case T_SHORT:   return sizeof(jshort);
1885   case T_CHAR:    return sizeof(jchar);
1886   case T_INT:     return sizeof(jint);
1887   default:
1888     ShouldNotReachHere();
1889     return -1;
1890   }
1891 }
1892 
1893 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
1894   switch (eltype) {
1895   // T_BOOLEAN used as surrogate for unsigned byte
1896   case T_BOOLEAN: lbu(dst, src);   break;
1897   case T_BYTE:     lb(dst, src);   break;
1898   case T_SHORT:    lh(dst, src);   break;
1899   case T_CHAR:    lhu(dst, src);   break;
1900   case T_INT:      lw(dst, src);   break;
1901   default:
1902     ShouldNotReachHere();
1903   }
1904 }
1905 
1906 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1907 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1908                                                               bool is_far, bool is_unordered);
1909 
1910 static conditional_branch_insn conditional_branches[] =
1911 {
1912   /* SHORT branches */
1913   (conditional_branch_insn)&MacroAssembler::beq,
1914   (conditional_branch_insn)&MacroAssembler::bgt,
1915   nullptr, // BoolTest::overflow
1916   (conditional_branch_insn)&MacroAssembler::blt,
1917   (conditional_branch_insn)&MacroAssembler::bne,
1918   (conditional_branch_insn)&MacroAssembler::ble,
1919   nullptr, // BoolTest::no_overflow
1920   (conditional_branch_insn)&MacroAssembler::bge,
1921 
1922   /* UNSIGNED branches */
1923   (conditional_branch_insn)&MacroAssembler::beq,
1924   (conditional_branch_insn)&MacroAssembler::bgtu,
1925   nullptr,
1926   (conditional_branch_insn)&MacroAssembler::bltu,
1927   (conditional_branch_insn)&MacroAssembler::bne,
1928   (conditional_branch_insn)&MacroAssembler::bleu,
1929   nullptr,
1930   (conditional_branch_insn)&MacroAssembler::bgeu
1931 };
1932 
1933 static float_conditional_branch_insn float_conditional_branches[] =
1934 {
1935   /* FLOAT SHORT branches */
1936   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1937   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1938   nullptr,  // BoolTest::overflow
1939   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1940   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1941   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1942   nullptr, // BoolTest::no_overflow
1943   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1944 
1945   /* DOUBLE SHORT branches */
1946   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1947   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1948   nullptr,
1949   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1950   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1951   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1952   nullptr,
1953   (float_conditional_branch_insn)&MacroAssembler::double_bge
1954 };
1955 
1956 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1957   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1958          "invalid conditional branch index");
1959   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1960 }
1961 
// This function should only be used by C2. For unordered-greater comparisons the unordered
// flag is flipped: C2 uses unordered-lesser instead of unordered-greater, and finally
// commutes the result bits in do_one_bytecode().
1964 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1965   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1966          "invalid float conditional branch index");
1967   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1968   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1969     (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1970 }
1971 
1972 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1973   switch (cmpFlag) {
1974     case BoolTest::eq:
1975     case BoolTest::le:
1976       beqz(op1, L, is_far);
1977       break;
1978     case BoolTest::ne:
1979     case BoolTest::gt:
1980       bnez(op1, L, is_far);
1981       break;
1982     default:
1983       ShouldNotReachHere();
1984   }
1985 }
1986 
1987 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1988   switch (cmpFlag) {
1989     case BoolTest::eq:
1990       beqz(op1, L, is_far);
1991       break;
1992     case BoolTest::ne:
1993       bnez(op1, L, is_far);
1994       break;
1995     default:
1996       ShouldNotReachHere();
1997   }
1998 }
1999 
2000 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
2001   Label L;
2002   cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
2003   mv(dst, src);
2004   bind(L);
2005 }
2006 
2007 // Set dst to NaN if any NaN input.
2008 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
2009                                   bool is_double, bool is_min) {
2010   assert_different_registers(dst, src1, src2);
2011 
2012   Label Done, Compare;
2013 
2014   is_double ? fclass_d(t0, src1)
2015             : fclass_s(t0, src1);
2016   is_double ? fclass_d(t1, src2)
2017             : fclass_s(t1, src2);
2018   orr(t0, t0, t1);
2019   andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2020   beqz(t0, Compare);
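  // Note: RISC-V fmin/fmax return the non-NaN operand when only one input is NaN, while
  // Java Math.min/max must return NaN; adding the two inputs produces the canonical NaN
  // required in that case.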
2021   is_double ? fadd_d(dst, src1, src2)
2022             : fadd_s(dst, src1, src2);
2023   j(Done);
2024 
2025   bind(Compare);
2026   if (is_double) {
2027     is_min ? fmin_d(dst, src1, src2)
2028            : fmax_d(dst, src1, src2);
2029   } else {
2030     is_min ? fmin_s(dst, src1, src2)
2031            : fmax_s(dst, src1, src2);
2032   }
2033 
2034   bind(Done);
2035 }
2036 
2037 // According to Java SE specification, for floating-point round operations, if
2038 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the
// rounded result; this differs from the behavior of RISC-V fcvt instructions (which
// round out-of-range values to the nearest max or min value), therefore special
// handling is needed for NaN, +/-Infinity and +/-0.
2042 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
2043                                           Register tmp1, Register tmp2, Register tmp3) {
2044 
2045   assert_different_registers(dst, src);
2046   assert_different_registers(tmp1, tmp2, tmp3);
2047 
  // Set the rounding mode for the conversions.
  // We use the same mode for both the double->long and long->double conversions.
  // A different mode for the long->double conversion would only matter if the long value
  // were not representable as a double; since the long is the result of the double->long
  // conversion here, it is definitely representable.
2052   RoundingMode rm;
2053   switch (round_mode) {
2054     case RoundDoubleModeNode::rmode_ceil:
2055       rm = RoundingMode::rup;
2056       break;
2057     case RoundDoubleModeNode::rmode_floor:
2058       rm = RoundingMode::rdn;
2059       break;
2060     case RoundDoubleModeNode::rmode_rint:
2061       rm = RoundingMode::rne;
2062       break;
2063     default:
2064       ShouldNotReachHere();
2065   }
2066 
  // tmp1 - holds the double converted to a long
  // tmp2 - holds the constant used for the overflow comparison
  // tmp3 - holds the modified result of the double->long conversion
2070   Label done, bad_val;
2071 
2072   // Conversion from double to long
2073   fcvt_l_d(tmp1, src, rm);
2074 
2075   // Generate constant (tmp2)
2076   // tmp2 = 100...0000
2077   addi(tmp2, zr, 1);
2078   slli(tmp2, tmp2, 63);
2079 
  // Prepare the converted long (tmp1).
  // When the conversion overflows we get either:
  //   tmp1 = 011...1111 or 100...0000
  // Convert both patterns to: tmp3 = 100...0000
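  // (fcvt.l.d saturates out-of-range and NaN inputs to the min/max long value, so both
  // saturation patterns - and NaN - end up taking the bad_val path.)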
2084   addi(tmp3, tmp1, 1);
2085   andi(tmp3, tmp3, -2);
2086   beq(tmp3, tmp2, bad_val);
2087 
2088   // Conversion from long to double
2089   fcvt_d_l(dst, tmp1, rm);
2090   // Add sign of input value to result for +/- 0 cases
2091   fsgnj_d(dst, dst, src);
2092   j(done);
2093 
  // If the conversion overflowed, return src
2095   bind(bad_val);
2096   fmv_d(dst, src);
2097 
2098   bind(done);
2099 }
2100 
// According to the Java SE specification, for floating-point signum operations, if
// the input is NaN or +/-0.0 it should be returned unchanged,
// otherwise return +/-1.0 with the sign of the input.
// one - provides a floating-point 1.0 (obtained from the matching rule)
// is_double - specifies whether single or double precision operations will be used.
2106 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
2107   Label done;
2108 
2109   is_double ? fclass_d(t0, dst)
2110             : fclass_s(t0, dst);
2111 
2112   // check if input is -0, +0, signaling NaN or quiet NaN
2113   andi(t0, t0, fclass_mask::zero | fclass_mask::nan);
2114 
2115   bnez(t0, done);
2116 
2117   // use floating-point 1.0 with a sign of input
2118   is_double ? fsgnj_d(dst, one, dst)
2119             : fsgnj_s(dst, one, dst);
2120 
2121   bind(done);
2122 }
2123 
2124 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
2125 #define __ masm.
2126   FloatRegister dst = stub.data<0>();
2127   Register src = stub.data<1>();
2128   Register tmp = stub.data<2>();
2129   __ bind(stub.entry());
2130 
  // The following instructions mainly deal with NaN, as riscv fcvt does not handle
  // NaN well, but the code also works for Inf at the same time.
2133 
  // Construct a 32-bit NaN from the 16-bit NaN;
  // the payloads of non-canonical NaNs need to be preserved.
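  // Illustrative bit layout (IEEE 754 binary16 -> binary32): the 10 payload bits in
  // half bits [9:0] are shifted left by 13 into float mantissa bits [22:13], and
  // 0x7f800000 sets the float exponent to all ones, giving a 32-bit NaN (or Inf when
  // the payload is zero).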
2136   __ mv(tmp, 0x7f800000);
2137   // sign-bit was already set via sign-extension if necessary.
2138   __ slli(t0, src, 13);
2139   __ orr(tmp, t0, tmp);
2140   __ fmv_w_x(dst, tmp);
2141 
2142   __ j(stub.continuation());
2143 #undef __
2144 }
2145 
2146 // j.l.Float.float16ToFloat
2147 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
2148   auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
2149 
  // On riscv, NaN needs special handling as fcvt does not work in that case.
  // Inf, by contrast, is handled correctly by fcvt, but we still let the slow path
  // process NaN and Inf together: both are rare cases, and having the slow path
  // handle only NaN would sacrifice performance in the normal cases,
  // i.e. the non-NaN and non-Inf cases.
2156 
2157   // check whether it's a NaN or +/- Inf.
2158   mv(t0, 0x7c00);
2159   andr(tmp, src, t0);
2160   // jump to stub processing NaN and Inf cases.
2161   beq(t0, tmp, stub->entry());
2162 
  // For non-NaN and non-Inf cases, just use the built-in instructions.
2164   fmv_h_x(dst, src);
2165   fcvt_s_h(dst, dst);
2166 
2167   bind(stub->continuation());
2168 }
2169 
2170 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
2171 #define __ masm.
2172   Register dst = stub.data<0>();
2173   FloatRegister src = stub.data<1>();
2174   Register tmp = stub.data<2>();
2175   __ bind(stub.entry());
2176 
2177   __ fmv_x_w(dst, src);
2178 
2179   // preserve the payloads of non-canonical NaNs.
2180   __ srai(dst, dst, 13);
2181   // preserve the sign bit.
2182   __ srai(tmp, dst, 13);
2183   __ slli(tmp, tmp, 10);
2184   __ mv(t0, 0x3ff);
2185   __ orr(tmp, tmp, t0);
2186 
2187   // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2188   __ andr(dst, dst, tmp);
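  // Rough sketch of the intent: the first srai drops the 13 low mantissa bits while the
  // sign extension keeps the sign; the mask built in tmp keeps those 10 payload bits and
  // the half exponent bits and carries the original sign, so the final andr yields a
  // (sign-extended) binary16 NaN with the original sign and the truncated payload.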
2189 
2190   __ j(stub.continuation());
2191 #undef __
2192 }
2193 
2194 // j.l.Float.floatToFloat16
2195 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
2196   auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);
2197 
2198   // On riscv, NaN needs a special process as fcvt does not work in that case.
2199 
2200   // check whether it's a NaN.
2201   // replace fclass with feq as performance optimization.
2202   feq_s(t0, src, src);
2203   // jump to stub processing NaN cases.
2204   beqz(t0, stub->entry());
2205 
2206   // non-NaN cases, just use built-in instructions.
2207   fcvt_h_s(ftmp, src);
2208   fmv_x_h(dst, ftmp);
2209 
2210   bind(stub->continuation());
2211 }
2212 
2213 static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
2214 #define __ masm.
2215   VectorRegister dst = stub.data<0>();
2216   VectorRegister src = stub.data<1>();
2217   uint vector_length = stub.data<2>();
2218   __ bind(stub.entry());
2219 
  // The following instructions mainly deal with NaN, as riscv does not handle
  // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
  //
  // Construct 32-bit NaNs from the 16-bit NaNs;
  // the payloads of non-canonical NaNs need to be preserved.
2225 
2226   // adjust vector type to 2 * SEW.
2227   __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
2228   // widen and sign-extend src data.
2229   __ vsext_vf2(dst, src, Assembler::v0_t);
2230   __ mv(t0, 0x7f800000);
2231   // sign-bit was already set via sign-extension if necessary.
2232   __ vsll_vi(dst, dst, 13, Assembler::v0_t);
2233   __ vor_vx(dst, dst, t0, Assembler::v0_t);
2234 
2235   __ j(stub.continuation());
2236 #undef __
2237 }
2238 
2239 // j.l.Float.float16ToFloat
2240 void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
2241   auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
2242               (dst, src, vector_length, 24, float16_to_float_v_slow_path);
2243   assert_different_registers(dst, src);
2244 
  // On riscv, NaN needs special handling as vfwcvt_f_f_v does not work in that case.
  // Inf, by contrast, is handled correctly by vfwcvt_f_f_v, but we still let the slow
  // path process NaN and Inf together: both are rare cases, and having the slow path
  // handle only NaN would sacrifice performance in the normal cases,
  // i.e. the non-NaN and non-Inf cases.
2251 
2252   vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);
2253 
2254   // check whether there is a NaN or +/- Inf.
2255   mv(t0, 0x7c00);
2256   vand_vx(v0, src, t0);
2257   // v0 will be used as mask in slow path.
2258   vmseq_vx(v0, v0, t0);
2259   vcpop_m(t0, v0);
2260 
2261   // For non-NaN or non-Inf cases, just use built-in instructions.
2262   vfwcvt_f_f_v(dst, src);
2263 
2264   // jump to stub processing NaN and Inf cases if there is any of them in the vector-wide.
2265   bnez(t0, stub->entry());
2266 
2267   bind(stub->continuation());
2268 }
2269 
2270 static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
2271                                          C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
2272 #define __ masm.
2273   VectorRegister dst = stub.data<0>();
2274   VectorRegister src = stub.data<1>();
2275   VectorRegister tmp = stub.data<2>();
2276   __ bind(stub.entry());
2277 
2278   // mul is already set to mf2 in float_to_float16_v.
2279 
2280   // preserve the payloads of non-canonical NaNs.
2281   __ vnsra_wi(dst, src, 13, Assembler::v0_t);
2282 
2283   // preserve the sign bit.
2284   __ vnsra_wi(tmp, src, 26, Assembler::v0_t);
2285   __ vsll_vi(tmp, tmp, 10, Assembler::v0_t);
2286   __ mv(t0, 0x3ff);
2287   __ vor_vx(tmp, tmp, t0, Assembler::v0_t);
2288 
2289   // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2290   __ vand_vv(dst, dst, tmp, Assembler::v0_t);
2291 
2292   __ j(stub.continuation());
2293 #undef __
2294 }
2295 
// j.l.Float.floatToFloat16
2297 void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src, VectorRegister vtmp,
2298                                            Register tmp, uint vector_length) {
2299   assert_different_registers(dst, src, vtmp);
2300 
2301   auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2302               (dst, src, vtmp, 28, float_to_float16_v_slow_path);
2303 
2304   // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.
2305 
2306   vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);
2307 
2308   // check whether there is a NaN.
  // replace vfclass with vmfne_vv as a performance optimization.
2310   vmfne_vv(v0, src, src);
2311   vcpop_m(t0, v0);
2312 
2313   vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);
2314 
2315   // For non-NaN cases, just use built-in instructions.
2316   vfncvt_f_f_w(dst, src);
2317 
2318   // jump to stub processing NaN cases.
2319   bnez(t0, stub->entry());
2320 
2321   bind(stub->continuation());
2322 }
2323 
2324 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
2325   vsetvli_helper(bt, vlen);
2326 
2327   // check if input is -0, +0, signaling NaN or quiet NaN
2328   vfclass_v(v0, dst);
2329   mv(t0, fclass_mask::zero | fclass_mask::nan);
2330   vand_vx(v0, v0, t0);
2331   vmseq_vi(v0, v0, 0);
2332 
2333   // use floating-point 1.0 with a sign of input
2334   vfsgnj_vv(dst, one, dst, v0_t);
2335 }
2336 
2337 void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
2338   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2339   // intrinsic is enabled when MaxVectorSize >= 16
2340   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2341   long len = is_long ? 64 : 32;
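  // Illustrative semantics (as in j.l.Integer.compress): every bit of src selected by a set
  // mask bit is packed towards the lsb of dst, e.g. compress(0b1011, 0b0110) == 0b01.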
2342 
2343   // load the src data(in bits) to be compressed.
2344   vsetivli(x0, 1, sew, Assembler::m1);
2345   vmv_s_x(v0, src);
2346   // reset the src data(in bytes) to zero.
2347   mv(t0, len);
2348   vsetvli(x0, t0, Assembler::e8, lmul);
2349   vmv_v_i(v4, 0);
2350   // convert the src data from bits to bytes.
2351   vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
2352   // reset the dst data(in bytes) to zero.
2353   vmv_v_i(v8, 0);
2354   // load the mask data(in bits).
2355   vsetivli(x0, 1, sew, Assembler::m1);
2356   vmv_s_x(v0, mask);
2357   // compress the src data(in bytes) to dst(in bytes).
2358   vsetvli(x0, t0, Assembler::e8, lmul);
2359   vcompress_vm(v8, v4, v0);
2360   // convert the dst data from bytes to bits.
2361   vmseq_vi(v0, v8, 1);
2362   // store result back.
2363   vsetivli(x0, 1, sew, Assembler::m1);
2364   vmv_x_s(dst, v0);
2365 }
2366 
2367 void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
2368   compress_bits_v(dst, src, mask, /* is_long */ false);
2369 }
2370 
2371 void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
2372   compress_bits_v(dst, src, mask, /* is_long */ true);
2373 }
2374 
2375 void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
2376   Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2377   // intrinsic is enabled when MaxVectorSize >= 16
2378   Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2379   long len = is_long ? 64 : 32;
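  // Illustrative semantics (as in j.l.Integer.expand): the low bits of src are scattered to
  // the positions of the set mask bits, e.g. expand(0b01, 0b0110) == 0b0010.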
2380 
2381   // load the src data(in bits) to be expanded.
2382   vsetivli(x0, 1, sew, Assembler::m1);
2383   vmv_s_x(v0, src);
2384   // reset the src data(in bytes) to zero.
2385   mv(t0, len);
2386   vsetvli(x0, t0, Assembler::e8, lmul);
2387   vmv_v_i(v4, 0);
2388   // convert the src data from bits to bytes.
2389   vmerge_vim(v4, v4, 1); // v0 as implicit mask register
2390   // reset the dst data(in bytes) to zero.
2391   vmv_v_i(v12, 0);
2392   // load the mask data(in bits).
2393   vsetivli(x0, 1, sew, Assembler::m1);
2394   vmv_s_x(v0, mask);
2395   // expand the src data(in bytes) to dst(in bytes).
2396   vsetvli(x0, t0, Assembler::e8, lmul);
2397   viota_m(v8, v0);
2398   vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
2399   // convert the dst data from bytes to bits.
2400   vmseq_vi(v0, v12, 1);
2401   // store result back.
2402   vsetivli(x0, 1, sew, Assembler::m1);
2403   vmv_x_s(dst, v0);
2404 }
2405 
2406 void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
2407   expand_bits_v(dst, src, mask, /* is_long */ false);
2408 }
2409 
2410 void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
2411   expand_bits_v(dst, src, mask, /* is_long */ true);
2412 }
2413 
2414 // j.l.Math.round(float)
2415 //  Returns the closest int to the argument, with ties rounding to positive infinity.
2416 // We need to handle 3 special cases defined by java api spec:
2417 //    NaN,
2418 //    float >= Integer.MAX_VALUE,
2419 //    float <= Integer.MIN_VALUE.
2420 void C2_MacroAssembler::java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2421                                            BasicType bt, uint vector_length) {
  // On riscv there is no rounding mode that directly matches the behaviour defined by the
  // Java API spec; every rounding mode fails on some corner case, e.g.
  //  RNE is the closest one, but it ties to "even", so both 1.5 and 2.5 would be converted
  //    to 2, instead of 2 and 3 respectively.
  //  RUP does not work either: although the Java API requires "rounding to positive infinity",
  //    both 1.3 and 1.8 would be converted to 2, instead of 1 and 2 respectively.
  //
  // The solution for non-NaN cases is:
  //    dst = src + 0.5, with the rdn rounding mode,
  //    then convert dst from float to int, also with the rdn rounding mode.
  // This also works as expected for float >= Integer.MAX_VALUE and float <= Integer.MIN_VALUE.
  //
  // But we still need to handle NaN explicitly with vector mask instructions.
  //
  // Check MacroAssembler::java_round_float and C2_MacroAssembler::vector_round_sve in aarch64 for more details.
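  //
  // Worked example: round(1.5f): 1.5 + 0.5 = 2.0 -> 2; round(2.5f): 3.0 -> 3;
  // round(1.8f): 2.3 -> (fcvt, rdn) 2; round(-1.5f): -1.0 -> -1; all match Math.round.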
2437 
2438   csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2439   vsetvli_helper(bt, vector_length);
2440 
  // Don't rearrange the instruction sequence without performance testing;
  // check MacroAssembler::java_round_float in riscv for more details.
2443   mv(t0, jint_cast(0.5f));
2444   fmv_w_x(ftmp, t0);
2445 
2446   // replacing vfclass with feq as performance optimization
2447   vmfeq_vv(v0, src, src);
2448   // set dst = 0 in cases of NaN
2449   vmv_v_x(dst, zr);
2450 
2451   // dst = (src + 0.5) rounded down towards negative infinity
2452   vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2453   vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2454 
2455   csrwi(CSR_FRM, C2_MacroAssembler::rne);
2456 }
2457 
2458 // java.lang.Math.round(double a)
2459 // Returns the closest long to the argument, with ties rounding to positive infinity.
2460 void C2_MacroAssembler::java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2461                                             BasicType bt, uint vector_length) {
2462   // check C2_MacroAssembler::java_round_float_v above for more details.
2463 
2464   csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2465   vsetvli_helper(bt, vector_length);
2466 
2467   mv(t0, julong_cast(0.5));
2468   fmv_d_x(ftmp, t0);
2469 
2470   // replacing vfclass with feq as performance optimization
2471   vmfeq_vv(v0, src, src);
2472   // set dst = 0 in cases of NaN
2473   vmv_v_x(dst, zr);
2474 
2475   // dst = (src + 0.5) rounded down towards negative infinity
2476   vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2477   vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2478 
2479   csrwi(CSR_FRM, C2_MacroAssembler::rne);
2480 }
2481 
2482 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
2483                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE,
2484                                         Assembler::LMUL lmul) {
2485   Label loop;
2486   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
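  // Strip-mined element-wise compare: each iteration vsetvli decides how many elements to
  // process (returned in tmp1), vmsne_vv/vfirst_m find the first mismatching element (index
  // in tmp2, or -1 if none), and the pointers and count are advanced until cnt reaches zero.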
2487 
2488   bind(loop);
2489   vsetvli(tmp1, cnt, sew, lmul);
2490   vlex_v(vr1, a1, sew);
2491   vlex_v(vr2, a2, sew);
2492   vmsne_vv(vrs, vr1, vr2);
2493   vfirst_m(tmp2, vrs);
2494   bgez(tmp2, DONE);
2495   sub(cnt, cnt, tmp1);
2496   if (!islatin) {
2497     slli(tmp1, tmp1, 1); // get byte counts
2498   }
2499   add(a1, a1, tmp1);
2500   add(a2, a2, tmp1);
2501   bnez(cnt, loop);
2502 
2503   mv(result, true);
2504 }
2505 
2506 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
2507   Label DONE;
2508   Register tmp1 = t0;
2509   Register tmp2 = t1;
2510 
2511   BLOCK_COMMENT("string_equals_v {");
2512 
2513   mv(result, false);
2514 
2515   element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE, Assembler::m2);
2516 
2517   bind(DONE);
2518   BLOCK_COMMENT("} string_equals_v");
2519 }
2520 
2521 // used by C2 ClearArray patterns.
2522 // base: Address of a buffer to be zeroed
2523 // cnt: Count in HeapWords
2524 //
2525 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2526 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
2527   Label loop;
2528 
2529   // making zero words
2530   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2531   vxor_vv(v4, v4, v4);
2532 
2533   bind(loop);
2534   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2535   vse64_v(v4, base);
2536   sub(cnt, cnt, t0);
2537   shadd(base, t0, base, t0, 3);
2538   bnez(cnt, loop);
2539 }
2540 
2541 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
2542                                         Register cnt1, int elem_size) {
2543   Label DONE;
2544   Register tmp1 = t0;
2545   Register tmp2 = t1;
2546   Register cnt2 = tmp2;
2547   int length_offset = arrayOopDesc::length_offset_in_bytes();
2548   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
2549 
2550   BLOCK_COMMENT("arrays_equals_v {");
2551 
2552   // if (a1 == a2), return true
2553   mv(result, true);
2554   beq(a1, a2, DONE);
2555 
2556   mv(result, false);
2557   // if a1 == null or a2 == null, return false
2558   beqz(a1, DONE);
2559   beqz(a2, DONE);
2560   // if (a1.length != a2.length), return false
2561   lwu(cnt1, Address(a1, length_offset));
2562   lwu(cnt2, Address(a2, length_offset));
2563   bne(cnt1, cnt2, DONE);
2564 
2565   la(a1, Address(a1, base_offset));
2566   la(a2, Address(a2, base_offset));
2567 
2568   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE, Assembler::m2);
2569 
2570   bind(DONE);
2571 
2572   BLOCK_COMMENT("} arrays_equals_v");
2573 }
2574 
2575 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
2576                                          Register result, Register tmp1, Register tmp2, int encForm) {
2577   Label DIFFERENCE, DONE, L, loop;
2578   bool encLL = encForm == StrIntrinsicNode::LL;
2579   bool encLU = encForm == StrIntrinsicNode::LU;
2580   bool encUL = encForm == StrIntrinsicNode::UL;
2581 
2582   bool str1_isL = encLL || encLU;
2583   bool str2_isL = encLL || encUL;
2584 
2585   int minCharsInWord = encLL ? wordSize : wordSize / 2;
2586 
2587   BLOCK_COMMENT("string_compare {");
2588 
2589   // for Latin strings, 1 byte for 1 character
2590   // for UTF16 strings, 2 bytes for 1 character
2591   if (!str1_isL)
2592     sraiw(cnt1, cnt1, 1);
2593   if (!str2_isL)
2594     sraiw(cnt2, cnt2, 1);
2595 
  // result is preset to the length difference, which is what gets returned
  // when the shorter string is a prefix of the longer one.
  // Save the minimum of the string lengths in cnt2.
2598   sub(result, cnt1, cnt2);
2599   bgt(cnt1, cnt2, L);
2600   mv(cnt2, cnt1);
2601   bind(L);
2602 
  // We focus on optimizing for small-sized strings.
  // See the document below for string size distribution statistics:
  // https://cr.openjdk.org/~shade/density/string-density-report.pdf
2606   if (str1_isL == str2_isL) { // LL or UU
    // The choice of vector registers and LMUL below is based on tests on two different
    // boards, with vlen == 128 and vlen == 256 respectively.
2609     if (!encLL && MaxVectorSize == 16) { // UU
2610       element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v4, v8, v4, encLL, DIFFERENCE, Assembler::m4);
    } else { // LL, or UU with MaxVectorSize > 16
2612       element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE, Assembler::m2);
2613     }
2614 
2615     j(DONE);
2616   } else { // LU or UL
2617     Register strL = encLU ? str1 : str2;
2618     Register strU = encLU ? str2 : str1;
2619     VectorRegister vstr1 = encLU ? v8 : v4;
2620     VectorRegister vstr2 = encLU ? v4 : v8;
2621 
2622     bind(loop);
2623     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
2624     vle8_v(vstr1, strL);
2625     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
2626     vzext_vf2(vstr2, vstr1);
2627     vle16_v(vstr1, strU);
2628     vmsne_vv(v4, vstr2, vstr1);
2629     vfirst_m(tmp2, v4);
2630     bgez(tmp2, DIFFERENCE);
2631     sub(cnt2, cnt2, tmp1);
2632     add(strL, strL, tmp1);
2633     shadd(strU, tmp1, strU, tmp1, 1);
2634     bnez(cnt2, loop);
2635     j(DONE);
2636   }
2637 
2638   bind(DIFFERENCE);
2639   slli(tmp1, tmp2, 1);
2640   add(str1, str1, str1_isL ? tmp2 : tmp1);
2641   add(str2, str2, str2_isL ? tmp2 : tmp1);
2642   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2643   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2644   sub(result, tmp1, tmp2);
2645 
2646   bind(DONE);
2647 }
2648 
2649 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2650   Label loop;
2651   assert_different_registers(src, dst, len, tmp, t0);
2652 
2653   BLOCK_COMMENT("byte_array_inflate_v {");
2654   bind(loop);
2655   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2656   vle8_v(v6, src);
2657   vsetvli(t0, len, Assembler::e16, Assembler::m4);
2658   vzext_vf2(v4, v6);
2659   vse16_v(v4, dst);
2660   sub(len, len, tmp);
2661   add(src, src, tmp);
2662   shadd(dst, tmp, dst, tmp, 1);
2663   bnez(len, loop);
2664   BLOCK_COMMENT("} byte_array_inflate_v");
2665 }
2666 
2667 // Compress char[] array to byte[].
2668 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
2669 // result: the array length if every element in array can be encoded,
2670 // otherwise, the index of first non-latin1 (> 0xff) character.
2671 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2672                                               Register result, Register tmp) {
2673   encode_iso_array_v(src, dst, len, result, tmp, false);
2674 }
2675 
2676 // Intrinsic for
2677 //
2678 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
2679 //     return the number of characters copied.
2680 // - java/lang/StringUTF16.compress
2681 //     return index of non-latin1 character if copy fails, otherwise 'len'.
2682 //
2683 // This version always returns the number of characters copied. A successful
2684 // copy will complete with the post-condition: 'res' == 'len', while an
2685 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2686 //
2687 // Clobbers: src, dst, len, result, t0
2688 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2689                                            Register result, Register tmp, bool ascii) {
2690   Label loop, fail, done;
2691 
2692   BLOCK_COMMENT("encode_iso_array_v {");
2693   mv(result, 0);
2694 
2695   bind(loop);
2696   mv(tmp, ascii ? 0x7f : 0xff);
2697   vsetvli(t0, len, Assembler::e16, Assembler::m2);
2698   vle16_v(v2, src);
2699 
2700   vmsgtu_vx(v1, v2, tmp);
2701   vfirst_m(tmp, v1);
2702   vmsbf_m(v0, v1);
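  // v0 now masks the elements before the first char above the limit (all elements if there
  // is none), so only the encodable prefix is narrowed and stored below.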
2703   // compress char to byte
2704   vsetvli(t0, len, Assembler::e8);
2705   vncvt_x_x_w(v1, v2, Assembler::v0_t);
2706   vse8_v(v1, dst, Assembler::v0_t);
2707 
2708   // fail if char > 0x7f/0xff
2709   bgez(tmp, fail);
2710   add(result, result, t0);
2711   add(dst, dst, t0);
2712   sub(len, len, t0);
2713   shadd(src, t0, src, t0, 1);
2714   bnez(len, loop);
2715   j(done);
2716 
2717   bind(fail);
2718   add(result, result, tmp);
2719 
2720   bind(done);
2721   BLOCK_COMMENT("} encode_iso_array_v");
2722 }
2723 
2724 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2725   Label LOOP, SET_RESULT, DONE;
2726 
2727   BLOCK_COMMENT("count_positives_v {");
2728   assert_different_registers(ary, len, result, tmp);
2729 
2730   mv(result, zr);
2731 
2732   bind(LOOP);
2733   vsetvli(t0, len, Assembler::e8, Assembler::m4);
2734   vle8_v(v4, ary);
2735   vmslt_vx(v4, v4, zr);
2736   vfirst_m(tmp, v4);
2737   bgez(tmp, SET_RESULT);
2738   // if tmp == -1, all bytes are positive
2739   add(result, result, t0);
2740 
2741   sub(len, len, t0);
2742   add(ary, ary, t0);
2743   bnez(len, LOOP);
2744   j(DONE);
2745 
2746   // add remaining positive bytes count
2747   bind(SET_RESULT);
2748   add(result, result, tmp);
2749 
2750   bind(DONE);
2751   BLOCK_COMMENT("} count_positives_v");
2752 }
2753 
2754 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2755                                               Register ch, Register result,
2756                                               Register tmp1, Register tmp2,
2757                                               bool isL) {
2758   mv(result, zr);
2759 
2760   Label loop, MATCH, DONE;
2761   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2762   bind(loop);
2763   vsetvli(tmp1, cnt1, sew, Assembler::m4);
2764   vlex_v(v4, str1, sew);
2765   vmseq_vx(v4, v4, ch);
2766   vfirst_m(tmp2, v4);
2767   bgez(tmp2, MATCH); // if equal, return index
2768 
2769   add(result, result, tmp1);
2770   sub(cnt1, cnt1, tmp1);
2771   if (!isL) slli(tmp1, tmp1, 1);
2772   add(str1, str1, tmp1);
2773   bnez(cnt1, loop);
2774 
2775   mv(result, -1);
2776   j(DONE);
2777 
2778   bind(MATCH);
2779   add(result, result, tmp2);
2780 
2781   bind(DONE);
2782 }
2783 
2784 // Set dst to NaN if any NaN input.
2785 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2786                                     BasicType bt, bool is_min, uint vector_length) {
2787   assert_different_registers(dst, src1, src2);
2788 
2789   vsetvli_helper(bt, vector_length);
2790 
2791   is_min ? vfmin_vv(dst, src1, src2)
2792          : vfmax_vv(dst, src1, src2);
2793 
2794   vmfne_vv(v0,  src1, src1);
2795   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2796   vmfne_vv(v0,  src2, src2);
2797   vfadd_vv(dst, src2, src2, Assembler::v0_t);
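  // The two mask/add pairs above patch the lanes where src1 or src2 is NaN: RISC-V
  // vfmin/vfmax would return the non-NaN operand there, but Java min/max must return NaN,
  // and src + src yields the canonical NaN for those lanes.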
2798 }
2799 
2800 // Set dst to NaN if any NaN input.
2801 // The destination vector register elements corresponding to masked-off elements
2802 // are handled with a mask-undisturbed policy.
2803 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2804                                            VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2805                                            BasicType bt, bool is_min, uint vector_length) {
2806   assert_different_registers(src1, src2, tmp1, tmp2);
2807   vsetvli_helper(bt, vector_length);
2808 
2809   // Check vector elements of src1 and src2 for NaN.
2810   vmfeq_vv(tmp1, src1, src1);
2811   vmfeq_vv(tmp2, src2, src2);
2812 
2813   vmandn_mm(v0, vmask, tmp1);
2814   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2815   vmandn_mm(v0, vmask, tmp2);
2816   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2817 
2818   vmand_mm(tmp2, tmp1, tmp2);
2819   vmand_mm(v0, vmask, tmp2);
2820   is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
2821          : vfmax_vv(dst, src1, src2, Assembler::v0_t);
2822 }
2823 
2824 // Set dst to NaN if any NaN input.
2825 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
2826                                            FloatRegister src1, VectorRegister src2,
2827                                            VectorRegister tmp1, VectorRegister tmp2,
2828                                            bool is_double, bool is_min, uint vector_length, VectorMask vm) {
2829   assert_different_registers(dst, src1);
2830   assert_different_registers(src2, tmp1, tmp2);
2831 
2832   Label L_done, L_NaN_1, L_NaN_2;
2833   // Set dst to src1 if src1 is NaN
2834   is_double ? feq_d(t0, src1, src1)
2835             : feq_s(t0, src1, src1);
2836   beqz(t0, L_NaN_2);
2837 
2838   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2839   vfmv_s_f(tmp2, src1);
2840 
2841   is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
2842          : vfredmax_vs(tmp1, src2, tmp2, vm);
2843   vfmv_f_s(dst, tmp1);
2844 
2845   // Checking NaNs in src2
2846   vmfne_vv(tmp1, src2, src2, vm);
2847   vcpop_m(t0, tmp1, vm);
2848   beqz(t0, L_done);
2849 
2850   bind(L_NaN_1);
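  // src2 has at least one NaN among the active lanes: an unordered sum reduction
  // propagates it, so dst becomes NaN as Java requires.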
2851   vfredusum_vs(tmp1, src2, tmp2, vm);
2852   vfmv_f_s(dst, tmp1);
2853   j(L_done);
2854 
2855   bind(L_NaN_2);
2856   is_double ? fmv_d(dst, src1)
2857             : fmv_s(dst, src1);
2858   bind(L_done);
2859 }
2860 
2861 bool C2_MacroAssembler::in_scratch_emit_size() {
2862   if (ciEnv::current()->task() != nullptr) {
2863     PhaseOutput* phase_output = Compile::current()->output();
2864     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2865       return true;
2866     }
2867   }
2868   return MacroAssembler::in_scratch_emit_size();
2869 }
2870 
2871 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
2872                                           VectorRegister src2, VectorRegister tmp,
2873                                           int opc, BasicType bt, uint vector_length, VectorMask vm) {
2874   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2875   vsetvli_helper(bt, vector_length);
2876   vmv_s_x(tmp, src1);
2877   switch (opc) {
2878     case Op_AddReductionVI:
2879     case Op_AddReductionVL:
2880       vredsum_vs(tmp, src2, tmp, vm);
2881       break;
2882     case Op_AndReductionV:
2883       vredand_vs(tmp, src2, tmp, vm);
2884       break;
2885     case Op_OrReductionV:
2886       vredor_vs(tmp, src2, tmp, vm);
2887       break;
2888     case Op_XorReductionV:
2889       vredxor_vs(tmp, src2, tmp, vm);
2890       break;
2891     case Op_MaxReductionV:
2892       vredmax_vs(tmp, src2, tmp, vm);
2893       break;
2894     case Op_MinReductionV:
2895       vredmin_vs(tmp, src2, tmp, vm);
2896       break;
2897     default:
2898       ShouldNotReachHere();
2899   }
2900   vmv_x_s(dst, tmp);
2901 }
2902 
2903 // Set vl and vtype for full and partial vector operations.
2904 // (vma = mu, vta = tu, vill = false)
2905 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
2906   Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
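  // Three cases: lengths up to 31 fit into the vsetivli immediate; a length equal to the
  // maximum element count uses the x0 source form to request vl = VLMAX; otherwise the
  // length is first materialized in tmp.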
2907   if (vector_length <= 31) {
2908     vsetivli(tmp, vector_length, sew, vlmul);
2909   } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
2910     vsetvli(tmp, x0, sew, vlmul);
2911   } else {
2912     mv(tmp, vector_length);
2913     vsetvli(tmp, tmp, sew, vlmul);
2914   }
2915 }
2916 
2917 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2918                                            int cond, BasicType bt, uint vector_length, VectorMask vm) {
2919   assert(is_integral_type(bt), "unsupported element type");
2920   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2921   vsetvli_helper(bt, vector_length);
2922   vmclr_m(vd);
2923   switch (cond) {
2924     case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
2925     case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
2926     case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
2927     case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
2928     case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
2929     case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
2930     case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break;
2931     case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break;
2932     case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break;
2933     case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break;
2934     default:
2935       assert(false, "unsupported compare condition");
2936       ShouldNotReachHere();
2937   }
2938 }
2939 
2940 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2941                                      int cond, BasicType bt, uint vector_length, VectorMask vm) {
2942   assert(is_floating_point_type(bt), "unsupported element type");
2943   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2944   vsetvli_helper(bt, vector_length);
2945   vmclr_m(vd);
2946   switch (cond) {
2947     case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
2948     case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
2949     case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
2950     case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
2951     case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
2952     case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
2953     default:
2954       assert(false, "unsupported compare condition");
2955       ShouldNotReachHere();
2956   }
2957 }
2958 
2959 // In Matcher::scalable_predicate_reg_slots,
2960 // we assume that each predicate register is one-eighth of the size of
2961 // a scalable vector register, i.e. one mask bit per vector byte.
2962 void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) {
2963   vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
2964   add(t0, sp, offset);
2965   vse8_v(v, t0);
2966 }
2967 
2968 void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) {
2969   vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
2970   add(t0, sp, offset);
2971   vle8_v(v, t0);
2972 }
2973 
2974 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
2975                                          VectorRegister src, BasicType src_bt, bool is_signed) {
2976   assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
2977   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2978   // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
2979   // A destination register group may only overlap a source register group when the destination
2980   // EEW is greater than the source EEW, the source EMUL is at least 1, and the overlap lies in
2981   // the highest-numbered part of the destination group. With LMUL = 1 the source EMUL is fractional, so vd and vs cannot be the same.
2982   assert_different_registers(dst, src);
2983 
2984   vsetvli_helper(dst_bt, vector_length);
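       // vsext.vf2/vf4/vf8 (and the vzext.* forms) widen source elements of SEW/2, SEW/4 and SEW/8
       // bits to SEW-bit destination elements, so the factor is picked from the dst/src size ratio.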
2985   if (is_signed) {
2986     if (src_bt == T_BYTE) {
2987       switch (dst_bt) {
2988       case T_SHORT:
2989         vsext_vf2(dst, src);
2990         break;
2991       case T_INT:
2992         vsext_vf4(dst, src);
2993         break;
2994       case T_LONG:
2995         vsext_vf8(dst, src);
2996         break;
2997       default:
2998         ShouldNotReachHere();
2999       }
3000     } else if (src_bt == T_SHORT) {
3001       if (dst_bt == T_INT) {
3002         vsext_vf2(dst, src);
3003       } else {
3004         vsext_vf4(dst, src);
3005       }
3006     } else if (src_bt == T_INT) {
3007       vsext_vf2(dst, src);
3008     }
3009   } else {
3010     if (src_bt == T_BYTE) {
3011       switch (dst_bt) {
3012       case T_SHORT:
3013         vzext_vf2(dst, src);
3014         break;
3015       case T_INT:
3016         vzext_vf4(dst, src);
3017         break;
3018       case T_LONG:
3019         vzext_vf8(dst, src);
3020         break;
3021       default:
3022         ShouldNotReachHere();
3023       }
3024     } else if (src_bt == T_SHORT) {
3025       if (dst_bt == T_INT) {
3026         vzext_vf2(dst, src);
3027       } else {
3028         vzext_vf4(dst, src);
3029       }
3030     } else if (src_bt == T_INT) {
3031       vzext_vf2(dst, src);
3032     }
3033   }
3034 }
3035 
3036 // Vector narrow from src to dst with specified element sizes.
3037 // High part of dst vector will be filled with zero.
3038 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
3039                                          VectorRegister src, BasicType src_bt) {
3040   assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
3041   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
3042   mv(t0, vector_length);
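       // vncvt.x.x.w narrows 2*SEW-bit source elements to SEW bits, so the element width is halved
       // one step at a time until dst_bt is reached.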
3043   if (src_bt == T_LONG) {
3044     // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
3045     // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
3046     // So we can currently only scale down by 1/2 the width at a time.
3047     vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
3048     vncvt_x_x_w(dst, src);
3049     if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
3050       vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
3051       vncvt_x_x_w(dst, dst);
3052       if (dst_bt == T_BYTE) {
3053         vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3054         vncvt_x_x_w(dst, dst);
3055       }
3056     }
3057   } else if (src_bt == T_INT) {
3058     // T_SHORT
3059     vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
3060     vncvt_x_x_w(dst, src);
3061     if (dst_bt == T_BYTE) {
3062       vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3063       vncvt_x_x_w(dst, dst);
3064     }
3065   } else if (src_bt == T_SHORT) {
3066     vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3067     vncvt_x_x_w(dst, src);
3068   }
3069 }
3070 
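     // Emit a "safe" float-to-integral conversion: dst is zeroed, v0 is set for the lanes where
     // src equals itself (i.e. is not NaN), and the conversion is performed only under that mask,
     // so NaN inputs yield 0 as required by Java's floating-point to integral conversions.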
3071 #define VFCVT_SAFE(VFLOATCVT)                                                      \
3072 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
3073   assert_different_registers(dst, src);                                            \
3074   vxor_vv(dst, dst, dst);                                                          \
3075   vmfeq_vv(v0, src, src);                                                          \
3076   VFLOATCVT(dst, src, Assembler::v0_t);                                            \
3077 }
3078 
3079 VFCVT_SAFE(vfcvt_rtz_x_f_v);
3080 
3081 #undef VFCVT_SAFE
3082 
3083 // Extract a scalar element from a vector at position 'idx'.
3084 // The input elements in src are expected to be of integral type.
3085 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
3086                                   int idx, VectorRegister tmp) {
3087   assert(is_integral_type(bt), "unsupported element type");
3088   assert(idx >= 0, "idx cannot be negative");
3089   // Only need the first element after vector slidedown
3090   vsetvli_helper(bt, 1);
3091   if (idx == 0) {
3092     vmv_x_s(dst, src);
3093   } else if (idx <= 31) {
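         // vslidedown.vi takes a 5-bit unsigned immediate, so idx values up to 31 can be encoded directly.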
3094     vslidedown_vi(tmp, src, idx);
3095     vmv_x_s(dst, tmp);
3096   } else {
3097     mv(t0, idx);
3098     vslidedown_vx(tmp, src, t0);
3099     vmv_x_s(dst, tmp);
3100   }
3101 }
3102 
3103 // Extract a scalar element from a vector at position 'idx'.
3104 // The input elements in src are expected to be of floating point type.
3105 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
3106                                      int idx, VectorRegister tmp) {
3107   assert(is_floating_point_type(bt), "unsupported element type");
3108   assert(idx >= 0, "idx cannot be negative");
3109   // Only need the first element after vector slidedown
3110   vsetvli_helper(bt, 1);
3111   if (idx == 0) {
3112     vfmv_f_s(dst, src);
3113   } else if (idx <= 31) {
3114     vslidedown_vi(tmp, src, idx);
3115     vfmv_f_s(dst, tmp);
3116   } else {
3117     mv(t0, idx);
3118     vslidedown_vx(tmp, src, t0);
3119     vfmv_f_s(dst, tmp);
3120   }
3121 }