1 /*
   2  * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/compile.hpp"
  31 #include "opto/intrinsicnode.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
  48                                   Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
  // Use t1 as a flag register to indicate the fast_lock result: zero for success; non-zero for failure.
  50   Register flag = t1;
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmp1Reg;
  54   Register tmp = tmp2Reg;
  55   Label object_has_monitor;
  // Finish fast lock successfully. MUST branch to locked with flag == 0.
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to slow_path with flag != 0.
  59   Label slow_path;
  60 
  61   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  62   assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);
  63 
  64   mv(flag, 1);
  65 
  66   // Load markWord from object into displaced_header.
  67   ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  68 
  69   if (DiagnoseSyncOnValueBasedClasses != 0) {
  70     load_klass(tmp, oop);
  71     lwu(tmp, Address(tmp, Klass::access_flags_offset()));
  72     test_bit(tmp, tmp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
  73     bnez(tmp, slow_path);
  74   }
  75 
  76   // Check for existing monitor
  77   test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value));
  78   bnez(tmp, object_has_monitor);
  79 
  80   if (LockingMode == LM_MONITOR) {
  81     j(slow_path);
  82   } else {
  83     assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | unlocked_value).
  85     ori(tmp, disp_hdr, markWord::unlocked_value);
  86 
  87     // Initialize the box. (Must happen before we update the object mark!)
  88     sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  89 
  90     // Compare object markWord with an unlocked value (tmp) and if
  91     // equal exchange the stack address of our box with object markWord.
  92     // On failure disp_hdr contains the possibly locked markWord.
  93     cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64,
  94             Assembler::aq, Assembler::rl, /*result*/disp_hdr);
  95     beq(disp_hdr, tmp, locked);
  96 
  97     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  98 
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object and have now locked it; we would have continued at label locked.
    // Otherwise we did not see an unlocked object, so try the fast recursive case.
 102 
 103     // Check if the owner is self by comparing the value in the
 104     // markWord of object (disp_hdr) with the stack pointer.
 105     sub(disp_hdr, disp_hdr, sp);
 106     mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
    // If (mark & lock_mask) == 0 and mark - sp < page_size, the markWord points into our own
    // stack, i.e. we already own the lock. Store 0 as the displaced header in the box (which
    // marks it as a recursive lock) and go to label locked.
 110     andr(tmp/*==0?*/, disp_hdr, tmp);
 111     sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 112     beqz(tmp, locked);
 113     j(slow_path);
 114   }
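  // In effect, the LM_LEGACY path above does roughly the following (a C-like sketch of the
  // code above, not additional logic; "mark" is the markWord observed in disp_hdr):
  //
  //   box->displaced_header = mark | unlocked_value;
  //   if (CAS(&obj->mark, mark | unlocked_value, box)) goto locked;  // acquired stack lock
  //   tmp = (obj->mark - sp) & (~(page_size - 1) | lock_mask);
  //   box->displaced_header = tmp;   // 0 iff we already hold the lock via our own stack
  //   if (tmp == 0) goto locked;     // recursive stack lock
  //   goto slow_path;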
 115 
 116   // Handle existing monitor.
 117   bind(object_has_monitor);
 118   // The object's monitor m is unlocked iff m->owner == NULL,
 119   // otherwise m->owner may contain a thread or a stack address.
 120   //
 121   // Try to CAS m->owner from NULL to current thread.
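  // disp_hdr still holds the tagged markWord (ObjectMonitor* | monitor_value), so the
  // monitor_value tag is subtracted below when forming the address of the owner field.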
 122   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
 123   cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64,
 124           Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected)
 125 
 126   // Store a non-null value into the box to avoid looking like a re-entrant
 127   // lock. The fast-path monitor unlock code checks for
 128   // markWord::monitor_value so use markWord::unused_mark which has the
 129   // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
 130   mv(tmp, (address)markWord::unused_mark().value());
 131   sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 132 
 133   beqz(tmp3Reg, locked); // CAS success means locking succeeded
 134 
 135   bne(tmp3Reg, xthread, slow_path); // Check for recursive locking
 136 
 137   // Recursive lock case
 138   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg);
 139 
 140   bind(locked);
 141   mv(flag, zr);
 142   increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2Reg, tmp3Reg);
 143 
 144 #ifdef ASSERT
 145   // Check that locked label is reached with flag == 0.
 146   Label flag_correct;
 147   beqz(flag, flag_correct);
 148   stop("Fast Lock Flag != 0");
 149 #endif
 150 
 151   bind(slow_path);
 152 #ifdef ASSERT
 153   // Check that slow_path label is reached with flag != 0.
 154   bnez(flag, flag_correct);
 155   stop("Fast Lock Flag == 0");
 156   bind(flag_correct);
 157 #endif
 158   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 159 }
 160 
 161 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
 162                                     Register tmp1Reg, Register tmp2Reg) {
  // Use t1 as a flag register to indicate the fast_unlock result: zero for success; non-zero for failure.
 164   Register flag = t1;
 165   Register oop = objectReg;
 166   Register box = boxReg;
 167   Register disp_hdr = tmp1Reg;
 168   Register tmp = tmp2Reg;
 169   Label object_has_monitor;
  // Finish fast unlock successfully. MUST branch to unlocked with flag == 0.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to slow_path with flag != 0.
  Label slow_path;
 174 
 175   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 176   assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);
 177 
 178   mv(flag, 1);
 179 
 180   if (LockingMode == LM_LEGACY) {
 181     // Find the lock address and load the displaced header from the stack.
 182     ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 183 
 184     // If the displaced header is 0, we have a recursive unlock.
 185     beqz(disp_hdr, unlocked);
 186   }
 187 
 188   // Handle existing monitor.
 189   ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 190   test_bit(t0, tmp, exact_log2(markWord::monitor_value));
 191   bnez(t0, object_has_monitor);
 192 
 193   if (LockingMode == LM_MONITOR) {
 194     j(slow_path);
 195   } else {
 196     assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock: this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.
 200 
 201     cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64,
 202             Assembler::relaxed, Assembler::rl, /*result*/tmp);
 203     beq(box, tmp, unlocked); // box == tmp if cas succeeds
 204     j(slow_path);
 205   }
 206 
 207   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 208 
 209   // Handle existing monitor.
 210   bind(object_has_monitor);
 211   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 212   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 213 
 214   ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 215 
 216   Label notRecursive;
 217   beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.
 218 
 219   // Recursive lock
 220   addi(disp_hdr, disp_hdr, -1);
 221   sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 222   j(unlocked);
 223 
 224   bind(notRecursive);
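  // In effect: if (m->EntryList != NULL || m->cxq != NULL) goto slow_path;
  // otherwise release the monitor by clearing m->owner (with a release fence).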
 225   ld(t0, Address(tmp, ObjectMonitor::EntryList_offset()));
 226   ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
 227   orr(t0, t0, disp_hdr); // Will be 0 if both are 0.
 228   bnez(t0, slow_path);
 229 
 230   // need a release store here
 231   la(tmp, Address(tmp, ObjectMonitor::owner_offset()));
 232   membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 233   sd(zr, Address(tmp)); // set unowned
 234 
 235   bind(unlocked);
 236   mv(flag, zr);
 237   decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp1Reg, tmp2Reg);
 238 
 239 #ifdef ASSERT
 240   // Check that unlocked label is reached with flag == 0.
 241   Label flag_correct;
 242   beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
 244 #endif
 245 
 246   bind(slow_path);
 247 #ifdef ASSERT
 248   // Check that slow_path label is reached with flag != 0.
 249   bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
 251   bind(flag_correct);
 252 #endif
 253   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 254 }
 255 
 256 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register tmp1, Register tmp2, Register tmp3) {
 257   // Flag register, zero for success; non-zero for failure.
 258   Register flag = t1;
 259 
 260   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 261   assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);
 262 
 263   mv(flag, 1);
 264 
 265   // Handle inflated monitor.
 266   Label inflated;
  // Finish fast lock successfully. MUST branch to locked with flag == 0.
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to slow_path with flag != 0.
 270   Label slow_path;
 271 
 272   if (DiagnoseSyncOnValueBasedClasses != 0) {
 273     load_klass(tmp1, obj);
 274     lwu(tmp1, Address(tmp1, Klass::access_flags_offset()));
 275     test_bit(tmp1, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
 276     bnez(tmp1, slow_path);
 277   }
 278 
 279   const Register tmp1_mark = tmp1;
 280 
 281   { // Lightweight locking
 282 
 283     // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
 284     Label push;
 285 
 286     const Register tmp2_top = tmp2;
 287     const Register tmp3_t = tmp3;
 288 
 289     // Check if lock-stack is full.
 290     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 291     mv(tmp3_t, (unsigned)LockStack::end_offset());
 292     bge(tmp2_top, tmp3_t, slow_path);
 293 
 294     // Check if recursive.
 295     add(tmp3_t, xthread, tmp2_top);
 296     ld(tmp3_t, Address(tmp3_t, -oopSize));
 297     beq(obj, tmp3_t, push);
 298 
 299     // Relaxed normal load to check for monitor. Optimization for monitor case.
 300     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 301     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 302     bnez(tmp3_t, inflated);
 303 
 304     // Not inflated
 305     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
 306 
 307     // Try to lock. Transition lock-bits 0b01 => 0b00
 308     ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
 309     xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
 310     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 311             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
 312     bne(tmp1_mark, tmp3_t, slow_path);
 313 
 314     bind(push);
 315     // After successful lock, push object on lock-stack.
 316     add(tmp3_t, xthread, tmp2_top);
 317     sd(obj, Address(tmp3_t));
 318     addw(tmp2_top, tmp2_top, oopSize);
 319     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 320     j(locked);
 321   }
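  // The block above is, roughly, the following C-like sketch ("top" is the thread's
  // lock-stack top offset, "stack" the lock-stack itself; names are illustrative):
  //
  //   if (top >= LockStack::end_offset()) goto slow_path;   // lock-stack full
  //   if (stack[top - oopSize] == obj) goto push;            // recursive
  //   if (obj->mark has monitor_value set) goto inflated;
  //   if (!CAS(&obj->mark, mark | unlocked_value, mark & ~unlocked_value)) goto slow_path;
  //  push:
  //   stack[top] = obj; top += oopSize; goto locked;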
 322 
 323   { // Handle inflated monitor.
 324     bind(inflated);
 325 
 326     // mark contains the tagged ObjectMonitor*.
 327     const Register tmp1_tagged_monitor = tmp1_mark;
 328     const uintptr_t monitor_tag = markWord::monitor_value;
 329     const Register tmp2_owner_addr = tmp2;
 330     const Register tmp3_owner = tmp3;
 331 
 332     // Compute owner address.
 333     la(tmp2_owner_addr, Address(tmp1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));
 334 
 335     // CAS owner (null => current thread).
 336     cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ xthread, Assembler::int64,
 337             /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
 338     beqz(tmp3_owner, locked);
 339 
 340     // Check if recursive.
 341     bne(tmp3_owner, xthread, slow_path);
 342 
 343     // Recursive.
 344     increment(Address(tmp1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1, tmp2, tmp3);
 345   }
 346 
 347   bind(locked);
 348   mv(flag, zr);
 349   increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);
 350 
 351 #ifdef ASSERT
 352   // Check that locked label is reached with flag == 0.
 353   Label flag_correct;
 354   beqz(flag, flag_correct);
 355   stop("Fast Lock Flag != 0");
 356 #endif
 357 
 358   bind(slow_path);
 359 #ifdef ASSERT
 360   // Check that slow_path label is reached with flag != 0.
 361   bnez(flag, flag_correct);
 362   stop("Fast Lock Flag == 0");
 363   bind(flag_correct);
 364 #endif
 365   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 366 }
 367 
 368 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register tmp1, Register tmp2,
 369                                                 Register tmp3) {
 370   // Flag register, zero for success; non-zero for failure.
 371   Register flag = t1;
 372 
 373   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 374   assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);
 375 
 376   mv(flag, 1);
 377 
 378   // Handle inflated monitor.
 379   Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST branch to unlocked with flag == 0.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to slow_path with flag != 0.
 383   Label slow_path;
 384 
 385   const Register tmp1_mark = tmp1;
 386   const Register tmp2_top = tmp2;
 387   const Register tmp3_t = tmp3;
 388 
 389   { // Lightweight unlock
 390 
 391     // Check if obj is top of lock-stack.
 392     lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 393     subw(tmp2_top, tmp2_top, oopSize);
 394     add(tmp3_t, xthread, tmp2_top);
 395     ld(tmp3_t, Address(tmp3_t));
 396     // Top of lock stack was not obj. Must be monitor.
 397     bne(obj, tmp3_t, inflated_load_monitor);
 398 
 399     // Pop lock-stack.
 400     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 401     DEBUG_ONLY(sd(zr, Address(tmp3_t));)
 402     sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 403 
 404     // Check if recursive.
 405     add(tmp3_t, xthread, tmp2_top);
 406     ld(tmp3_t, Address(tmp3_t, -oopSize));
 407     beq(obj, tmp3_t, unlocked);
 408 
 409     // Not recursive.
 410     // Load Mark.
 411     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 412 
 413     // Check header for monitor (0b10).
 414     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 415     bnez(tmp3_t, inflated);
 416 
 417     // Try to unlock. Transition lock bits 0b00 => 0b01
 418     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 419     ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
 420     cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
 421             /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
 422     beq(tmp1_mark, tmp3_t, unlocked);
 423 
 424     // Compare and exchange failed.
 425     // Restore lock-stack and handle the unlock in runtime.
 426     DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
 427     DEBUG_ONLY(sd(obj, Address(tmp3_t));)
 428     addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
 430     j(slow_path);
 431   }
 432 
 433   { // Handle inflated monitor.
 434     bind(inflated_load_monitor);
 435     ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 436 #ifdef ASSERT
 437     test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
 438     bnez(tmp3_t, inflated);
 439     stop("Fast Unlock not monitor");
 440 #endif
 441 
 442     bind(inflated);
 443 
 444 #ifdef ASSERT
 445     Label check_done;
 446     subw(tmp2_top, tmp2_top, oopSize);
 447     mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
 448     blt(tmp2_top, tmp3_t, check_done);
 449     add(tmp3_t, xthread, tmp2_top);
 450     ld(tmp3_t, Address(tmp3_t));
 451     bne(obj, tmp3_t, inflated);
 452     stop("Fast Unlock lock on stack");
 453     bind(check_done);
 454 #endif
 455 
 456     // mark contains the tagged ObjectMonitor*.
 457     const Register tmp1_monitor = tmp1_mark;
 458     const uintptr_t monitor_tag = markWord::monitor_value;
 459 
 460     // Untag the monitor.
 461     sub(tmp1_monitor, tmp1_mark, monitor_tag);
 462 
 463     const Register tmp2_recursions = tmp2;
 464     Label not_recursive;
 465 
 466     // Check if recursive.
 467     ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 468     beqz(tmp2_recursions, not_recursive);
 469 
 470     // Recursive unlock.
 471     addi(tmp2_recursions, tmp2_recursions, -1);
 472     sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
 473     j(unlocked);
 474 
 475     bind(not_recursive);
 476 
 477     Label release;
 478     const Register tmp2_owner_addr = tmp2;
 479 
 480     // Compute owner address.
 481     la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));
 482 
 483     // Check if the entry lists are empty.
 484     ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset()));
 485     ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset()));
 486     orr(t0, t0, tmp3_t);
 487     beqz(t0, release);
 488 
 489     // The owner may be anonymous and we removed the last obj entry in
 490     // the lock-stack. This loses the information about the owner.
 491     // Write the thread to the owner field so the runtime knows the owner.
 492     sd(xthread, Address(tmp2_owner_addr));
 493     j(slow_path);
 494 
 495     bind(release);
 496     // Set owner to null.
 497     membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
 498     sd(zr, Address(tmp2_owner_addr));
 499   }
 500 
 501   bind(unlocked);
 502   mv(flag, zr);
 503   decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);
 504 
 505 #ifdef ASSERT
 506   // Check that unlocked label is reached with flag == 0.
 507   Label flag_correct;
 508   beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
 510 #endif
 511 
 512   bind(slow_path);
 513 #ifdef ASSERT
 514   // Check that slow_path label is reached with flag != 0.
 515   bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
 517   bind(flag_correct);
 518 #endif
 519   // C2 uses the value of flag (0 vs !0) to determine the continuation.
 520 }
 521 
 522 // short string
 523 // StringUTF16.indexOfChar
 524 // StringLatin1.indexOfChar
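// Compares 8 characters per unrolled iteration (LOOP8), then 4 (LOOP4), then one at a
// time (LOOP1). Used by string_indexof_char below for short strings and for the
// unaligned prefix of longer ones.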
 525 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
 526                                                   Register ch, Register result,
 527                                                   bool isL)
 528 {
 529   Register ch1 = t0;
 530   Register index = t1;
 531 
 532   BLOCK_COMMENT("string_indexof_char_short {");
 533 
 534   Label LOOP, LOOP1, LOOP4, LOOP8;
 535   Label MATCH,  MATCH1, MATCH2, MATCH3,
 536         MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
 537 
 538   mv(result, -1);
 539   mv(index, zr);
 540 
 541   bind(LOOP);
 542   addi(t0, index, 8);
 543   ble(t0, cnt1, LOOP8);
 544   addi(t0, index, 4);
 545   ble(t0, cnt1, LOOP4);
 546   j(LOOP1);
 547 
 548   bind(LOOP8);
 549   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 550   beq(ch, ch1, MATCH);
 551   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 552   beq(ch, ch1, MATCH1);
 553   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 554   beq(ch, ch1, MATCH2);
 555   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 556   beq(ch, ch1, MATCH3);
 557   isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
 558   beq(ch, ch1, MATCH4);
 559   isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
 560   beq(ch, ch1, MATCH5);
 561   isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
 562   beq(ch, ch1, MATCH6);
 563   isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
 564   beq(ch, ch1, MATCH7);
 565   addi(index, index, 8);
 566   addi(str1, str1, isL ? 8 : 16);
 567   blt(index, cnt1, LOOP);
 568   j(NOMATCH);
 569 
 570   bind(LOOP4);
 571   isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
 572   beq(ch, ch1, MATCH);
 573   isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
 574   beq(ch, ch1, MATCH1);
 575   isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
 576   beq(ch, ch1, MATCH2);
 577   isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
 578   beq(ch, ch1, MATCH3);
 579   addi(index, index, 4);
 580   addi(str1, str1, isL ? 4 : 8);
 581   bge(index, cnt1, NOMATCH);
 582 
 583   bind(LOOP1);
 584   isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
 585   beq(ch, ch1, MATCH);
 586   addi(index, index, 1);
 587   addi(str1, str1, isL ? 1 : 2);
 588   blt(index, cnt1, LOOP1);
 589   j(NOMATCH);
 590 
 591   bind(MATCH1);
 592   addi(index, index, 1);
 593   j(MATCH);
 594 
 595   bind(MATCH2);
 596   addi(index, index, 2);
 597   j(MATCH);
 598 
 599   bind(MATCH3);
 600   addi(index, index, 3);
 601   j(MATCH);
 602 
 603   bind(MATCH4);
 604   addi(index, index, 4);
 605   j(MATCH);
 606 
 607   bind(MATCH5);
 608   addi(index, index, 5);
 609   j(MATCH);
 610 
 611   bind(MATCH6);
 612   addi(index, index, 6);
 613   j(MATCH);
 614 
 615   bind(MATCH7);
 616   addi(index, index, 7);
 617 
 618   bind(MATCH);
 619   mv(result, index);
 620   bind(NOMATCH);
 621   BLOCK_COMMENT("} string_indexof_char_short");
 622 }
 623 
 624 // StringUTF16.indexOfChar
 625 // StringLatin1.indexOfChar
 626 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 627                                             Register ch, Register result,
 628                                             Register tmp1, Register tmp2,
 629                                             Register tmp3, Register tmp4,
 630                                             bool isL)
 631 {
 632   Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
 633   Register ch1 = t0;
 634   Register orig_cnt = t1;
 635   Register mask1 = tmp3;
 636   Register mask2 = tmp2;
 637   Register match_mask = tmp1;
 638   Register trailing_char = tmp4;
 639   Register unaligned_elems = tmp4;
 640 
 641   BLOCK_COMMENT("string_indexof_char {");
 642   beqz(cnt1, NOMATCH);
 643 
 644   addi(t0, cnt1, isL ? -32 : -16);
 645   bgtz(t0, DO_LONG);
 646   string_indexof_char_short(str1, cnt1, ch, result, isL);
 647   j(DONE);
 648 
 649   bind(DO_LONG);
 650   mv(orig_cnt, cnt1);
 651   if (AvoidUnalignedAccesses) {
 652     Label ALIGNED;
 653     andi(unaligned_elems, str1, 0x7);
 654     beqz(unaligned_elems, ALIGNED);
 655     sub(unaligned_elems, unaligned_elems, 8);
 656     neg(unaligned_elems, unaligned_elems);
 657     if (!isL) {
 658       srli(unaligned_elems, unaligned_elems, 1);
 659     }
 660     // do unaligned part per element
 661     string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
 662     bgez(result, DONE);
 663     mv(orig_cnt, cnt1);
 664     sub(cnt1, cnt1, unaligned_elems);
 665     bind(ALIGNED);
 666   }
 667 
 668   // duplicate ch
 669   if (isL) {
 670     slli(ch1, ch, 8);
 671     orr(ch, ch1, ch);
 672   }
 673   slli(ch1, ch, 16);
 674   orr(ch, ch1, ch);
 675   slli(ch1, ch, 32);
 676   orr(ch, ch1, ch);
 677 
 678   if (!isL) {
 679     slli(cnt1, cnt1, 1);
 680   }
 681 
 682   uint64_t mask0101 = UCONST64(0x0101010101010101);
 683   uint64_t mask0001 = UCONST64(0x0001000100010001);
 684   mv(mask1, isL ? mask0101 : mask0001);
 685   uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
 686   uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
 687   mv(mask2, isL ? mask7f7f : mask7fff);
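
  // compute_match_mask is expected to apply the classic SWAR zero-byte trick to flag
  // every byte (Latin1) or halfword (UTF-16) lane of ch1 that equals the broadcast ch,
  // roughly (a sketch, not the helper's exact code):
  //   x = ch1 ^ ch;                               // matching lanes become 0
  //   match_mask = (x - mask1) & ~(x | mask2);    // high bit set in matching lanes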
 688 
 689   bind(CH1_LOOP);
 690   ld(ch1, Address(str1));
 691   addi(str1, str1, 8);
 692   addi(cnt1, cnt1, -8);
 693   compute_match_mask(ch1, ch, match_mask, mask1, mask2);
 694   bnez(match_mask, HIT);
 695   bgtz(cnt1, CH1_LOOP);
 696   j(NOMATCH);
 697 
 698   bind(HIT);
 699   ctzc_bit(trailing_char, match_mask, isL, ch1, result);
 700   srli(trailing_char, trailing_char, 3);
 701   addi(cnt1, cnt1, 8);
 702   ble(cnt1, trailing_char, NOMATCH);
 703   // match case
 704   if (!isL) {
 705     srli(cnt1, cnt1, 1);
 706     srli(trailing_char, trailing_char, 1);
 707   }
 708 
 709   sub(result, orig_cnt, cnt1);
 710   add(result, result, trailing_char);
 711   j(DONE);
 712 
 713   bind(NOMATCH);
 714   mv(result, -1);
 715 
 716   bind(DONE);
 717   BLOCK_COMMENT("} string_indexof_char");
 718 }
 719 
 720 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
 721 
 722 // Search for needle in haystack and return index or -1
 723 // x10: result
 724 // x11: haystack
 725 // x12: haystack_len
 726 // x13: needle
 727 // x14: needle_len
 728 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
 729                                        Register haystack_len, Register needle_len,
 730                                        Register tmp1, Register tmp2,
 731                                        Register tmp3, Register tmp4,
 732                                        Register tmp5, Register tmp6,
 733                                        Register result, int ae)
 734 {
 735   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 736 
 737   Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
 738 
 739   Register ch1 = t0;
 740   Register ch2 = t1;
 741   Register nlen_tmp = tmp1; // needle len tmp
 742   Register hlen_tmp = tmp2; // haystack len tmp
 743   Register result_tmp = tmp4;
 744 
 745   bool isLL = ae == StrIntrinsicNode::LL;
 746 
 747   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 748   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 749   int needle_chr_shift = needle_isL ? 0 : 1;
 750   int haystack_chr_shift = haystack_isL ? 0 : 1;
 751   int needle_chr_size = needle_isL ? 1 : 2;
 752   int haystack_chr_size = haystack_isL ? 1 : 2;
 753   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
 754                               (load_chr_insn)&MacroAssembler::lhu;
 755   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
 756                                 (load_chr_insn)&MacroAssembler::lhu;
 757 
 758   BLOCK_COMMENT("string_indexof {");
 759 
 760   // Note, inline_string_indexOf() generates checks:
 761   // if (pattern.count > src.count) return -1;
 762   // if (pattern.count == 0) return 0;
 763 
 764   // We have two strings, a source string in haystack, haystack_len and a pattern string
 765   // in needle, needle_len. Find the first occurrence of pattern in source or return -1.
 766 
 767   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 768   // With a small pattern and source we use linear scan.
 769 
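  // In outline, the selection below is:
  //   needle_len < 8                  -> LINEARSEARCH (inline linear scan)
  //   needle_len >= 256               -> LINEARSTUB   (linear-scan stub call)
  //   needle_len >= haystack_len / 4  -> LINEARSTUB
  //   otherwise                       -> Boyer-Moore-Horspool below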
 770   // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
 771   sub(result_tmp, haystack_len, needle_len);
 772   // needle_len < 8, use linear scan
 773   sub(t0, needle_len, 8);
 774   bltz(t0, LINEARSEARCH);
 775   // needle_len >= 256, use linear scan
 776   sub(t0, needle_len, 256);
 777   bgez(t0, LINEARSTUB);
 778   // needle_len >= haystack_len/4, use linear scan
 779   srli(t0, haystack_len, 2);
 780   bge(needle_len, t0, LINEARSTUB);
 781 
 782   // Boyer-Moore-Horspool introduction:
  // The Boyer Moore algorithm is based on the description here:
 784   //
 785   // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 786   //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 788   // and the 'Good Suffix' rule.
 789   //
 790   // These rules are essentially heuristics for how far we can shift the
 791   // pattern along the search string.
 792   //
 793   // The implementation here uses the 'Bad Character' rule only because of the
 794   // complexity of initialisation for the 'Good Suffix' rule.
 795   //
 796   // This is also known as the Boyer-Moore-Horspool algorithm:
 797   //
 798   // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 799   //
 800   // #define ASIZE 256
 801   //
 802   //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
 803   //      int i, j;
 804   //      unsigned c;
 805   //      unsigned char bc[ASIZE];
 806   //
 807   //      /* Preprocessing */
 808   //      for (i = 0; i < ASIZE; ++i)
 809   //        bc[i] = m;
 810   //      for (i = 0; i < m - 1; ) {
 811   //        c = pattern[i];
 812   //        ++i;
 813   //        // c < 256 for Latin1 string, so, no need for branch
 814   //        #ifdef PATTERN_STRING_IS_LATIN1
 815   //        bc[c] = m - i;
 816   //        #else
 817   //        if (c < ASIZE) bc[c] = m - i;
 818   //        #endif
 819   //      }
 820   //
 821   //      /* Searching */
 822   //      j = 0;
 823   //      while (j <= n - m) {
  //        c = src[j + m - 1];
  //        if (pattern[m - 1] == c) {
  //          int k;
  //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  //          if (k < 0) return j;
  //        }
  //        // c < 256 for Latin1 string, so no need for branch
  //        #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
  //        // LL case: (c < 256) is always true. Remove branch
  //        j += bc[c];
  //        #endif
  //        #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
  //        // UU case: need if (c < ASIZE) check. Skip 1 character if not.
  //        if (c < ASIZE)
  //          j += bc[c];
  //        else
  //          j += 1;
  //        #endif
  //        #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
  //        // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
  //        if (c < ASIZE)
  //          j += bc[c];
  //        else
  //          j += m;
  //        #endif
  //      }
  //      return -1;
 850   //    }
 851 
  // temp registers: t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
 853   Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 854         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 855 
 856   Register haystack_end = haystack_len;
 857   Register skipch = tmp2;
 858 
  // The pattern length is >= 8, so we can read at least one full register for the cases
  // where no UTF->Latin1 conversion is needed (8 chars for LL, 4 for UU), and half a
  // register for the UL case. We re-read the last character in the inner pre-loop code
  // so that only a single load is needed in the outer pre-loop.
 863   const int firstStep = isLL ? 7 : 3;
 864 
 865   const int ASIZE = 256;
  const int STORE_BYTES = 8; // 8 bytes stored per instruction (sd)
 867 
 868   sub(sp, sp, ASIZE);
 869 
 870   // init BC offset table with default value: needle_len
 871   slli(t0, needle_len, 8);
 872   orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
 873   slli(tmp1, t0, 16);
 874   orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
 875   slli(tmp1, t0, 32);
 876   orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
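  // i.e. tmp5 = needle_len replicated into all 8 byte lanes (needle_len < 256 here,
  // so it fits in a single byte)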
 877 
 878   mv(ch1, sp);  // ch1 is t0
 879   mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
 880 
 881   bind(BM_INIT_LOOP);
 882   // for (i = 0; i < ASIZE; ++i)
 883   //   bc[i] = m;
 884   for (int i = 0; i < 4; i++) {
 885     sd(tmp5, Address(ch1, i * wordSize));
 886   }
 887   add(ch1, ch1, 32);
 888   sub(tmp6, tmp6, 4);
 889   bgtz(tmp6, BM_INIT_LOOP);
 890 
 891   sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
 892   Register orig_haystack = tmp5;
 893   mv(orig_haystack, haystack);
 894   // result_tmp = tmp4
 895   shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
 896   sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
 897   mv(tmp3, needle);
 898 
 899   //  for (i = 0; i < m - 1; ) {
 900   //    c = pattern[i];
 901   //    ++i;
 902   //    // c < 256 for Latin1 string, so, no need for branch
 903   //    #ifdef PATTERN_STRING_IS_LATIN1
 904   //    bc[c] = m - i;
 905   //    #else
 906   //    if (c < ASIZE) bc[c] = m - i;
 907   //    #endif
 908   //  }
 909   bind(BCLOOP);
 910   (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
 911   add(tmp3, tmp3, needle_chr_size);
 912   if (!needle_isL) {
 913     // ae == StrIntrinsicNode::UU
 914     mv(tmp6, ASIZE);
 915     bgeu(ch1, tmp6, BCSKIP);
 916   }
 917   add(tmp4, sp, ch1);
 918   sb(ch2, Address(tmp4)); // store skip offset to BC offset table
 919 
 920   bind(BCSKIP);
 921   sub(ch2, ch2, 1); // for next pattern element, skip distance -1
 922   bgtz(ch2, BCLOOP);
 923 
 924   // tmp6: pattern end, address after needle
 925   shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
 926   if (needle_isL == haystack_isL) {
 927     // load last 8 bytes (8LL/4UU symbols)
 928     ld(tmp6, Address(tmp6, -wordSize));
 929   } else {
    // UL: search for a Latin1 pattern in a UTF-16 source
    lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes (4 symbols)
    // convert Latin1 to UTF-16, e.g. 0x0000abcd -> 0x0a0b0c0d
    // We have to wait until the load has completed, but it is still faster than per-character loads + checks
 934     srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
 935     slli(ch2, tmp6, XLEN - 24);
 936     srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
 937     slli(ch1, tmp6, XLEN - 16);
 938     srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
 939     andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
 940     slli(ch2, ch2, 16);
 941     orr(ch2, ch2, ch1); // 0x00000b0c
 942     slli(result, tmp3, 48); // use result as temp register
 943     orr(tmp6, tmp6, result); // 0x0a00000d
 944     slli(result, ch2, 16);
 945     orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
 946   }
 947 
 948   // i = m - 1;
 949   // skipch = j + i;
 950   // if (skipch == pattern[m - 1]
 951   //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
 952   // else
 953   //   move j with bad char offset table
 954   bind(BMLOOPSTR2);
 955   // compare pattern to source string backward
 956   shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
 957   (this->*haystack_load_1chr)(skipch, Address(result), noreg);
 958   sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
 959   if (needle_isL == haystack_isL) {
 960     // re-init tmp3. It's for free because it's executed in parallel with
 961     // load above. Alternative is to initialize it before loop, but it'll
 962     // affect performance on in-order systems with 2 or more ld/st pipelines
 963     srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
 964   }
 965   if (!isLL) { // UU/UL case
 966     slli(ch2, nlen_tmp, 1); // offsets in bytes
 967   }
 968   bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
 969   add(result, haystack, isLL ? nlen_tmp : ch2);
 970   // load 8 bytes from source string
 971   // if isLL is false then read granularity can be 2
 972   load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
 973   mv(ch1, tmp6);
 974   if (isLL) {
 975     j(BMLOOPSTR1_AFTER_LOAD);
 976   } else {
 977     sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 978     j(BMLOOPSTR1_CMP);
 979   }
 980 
 981   bind(BMLOOPSTR1);
 982   shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
 983   (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
 984   shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
 985   (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
 986 
 987   bind(BMLOOPSTR1_AFTER_LOAD);
 988   sub(nlen_tmp, nlen_tmp, 1);
 989   bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
 990 
 991   bind(BMLOOPSTR1_CMP);
 992   beq(ch1, ch2, BMLOOPSTR1);
 993 
 994   bind(BMSKIP);
 995   if (!isLL) {
    // if we've met a UTF-16 symbol while searching the Latin1 pattern, then we can
    // skip needle_len symbols
 998     if (needle_isL != haystack_isL) {
 999       mv(result_tmp, needle_len);
1000     } else {
1001       mv(result_tmp, 1);
1002     }
1003     mv(t0, ASIZE);
1004     bgeu(skipch, t0, BMADV);
1005   }
1006   add(result_tmp, sp, skipch);
1007   lbu(result_tmp, Address(result_tmp)); // load skip offset
1008 
1009   bind(BMADV);
1010   sub(nlen_tmp, needle_len, 1);
1011   // move haystack after bad char skip offset
1012   shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
1013   ble(haystack, haystack_end, BMLOOPSTR2);
1014   add(sp, sp, ASIZE);
1015   j(NOMATCH);
1016 
1017   bind(BMLOOPSTR1_LASTCMP);
1018   bne(ch1, ch2, BMSKIP);
1019 
1020   bind(BMMATCH);
1021   sub(result, haystack, orig_haystack);
1022   if (!haystack_isL) {
1023     srli(result, result, 1);
1024   }
1025   add(sp, sp, ASIZE);
1026   j(DONE);
1027 
1028   bind(LINEARSTUB);
  sub(t0, needle_len, 16); // small patterns should still be handled by the simple algorithm
1030   bltz(t0, LINEARSEARCH);
1031   mv(result, zr);
1032   RuntimeAddress stub = nullptr;
1033   if (isLL) {
1034     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
1035     assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
1036   } else if (needle_isL) {
1037     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
1038     assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
1039   } else {
1040     stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
1041     assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
1042   }
1043   address call = trampoline_call(stub);
1044   if (call == nullptr) {
1045     DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
1046     ciEnv::current()->record_failure("CodeCache is full");
1047     return;
1048   }
1049   j(DONE);
1050 
1051   bind(NOMATCH);
1052   mv(result, -1);
1053   j(DONE);
1054 
1055   bind(LINEARSEARCH);
1056   string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
1057 
1058   bind(DONE);
1059   BLOCK_COMMENT("} string_indexof");
1060 }
1061 
1062 // string_indexof
1063 // result: x10
1064 // src: x11
1065 // src_count: x12
1066 // pattern: x13
1067 // pattern_count: x14 or 1/2/3/4
1068 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
1069                                                Register haystack_len, Register needle_len,
1070                                                Register tmp1, Register tmp2,
1071                                                Register tmp3, Register tmp4,
1072                                                int needle_con_cnt, Register result, int ae)
1073 {
  // Note:
  // needle_con_cnt > 0 means the needle_len register is invalid and the needle length is constant:
  // for UU/LL: needle_con_cnt is in [1, 4]; for UL: needle_con_cnt == 1
1077   assert(needle_con_cnt <= 4, "Invalid needle constant count");
1078   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1079 
1080   Register ch1 = t0;
1081   Register ch2 = t1;
1082   Register hlen_neg = haystack_len, nlen_neg = needle_len;
1083   Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
1084 
1085   bool isLL = ae == StrIntrinsicNode::LL;
1086 
1087   bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
1088   bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
1089   int needle_chr_shift = needle_isL ? 0 : 1;
1090   int haystack_chr_shift = haystack_isL ? 0 : 1;
1091   int needle_chr_size = needle_isL ? 1 : 2;
1092   int haystack_chr_size = haystack_isL ? 1 : 2;
1093 
1094   load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
1095                               (load_chr_insn)&MacroAssembler::lhu;
1096   load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
1097                                 (load_chr_insn)&MacroAssembler::lhu;
1098   load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
1099   load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
1100 
1101   Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
1102 
1103   Register first = tmp3;
1104 
1105   if (needle_con_cnt == -1) {
1106     Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
1107 
1108     sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
1109     bltz(t0, DOSHORT);
1110 
1111     (this->*needle_load_1chr)(first, Address(needle), noreg);
1112     slli(t0, needle_len, needle_chr_shift);
1113     add(needle, needle, t0);
1114     neg(nlen_neg, t0);
1115     slli(t0, result_tmp, haystack_chr_shift);
1116     add(haystack, haystack, t0);
1117     neg(hlen_neg, t0);
1118 
1119     bind(FIRST_LOOP);
1120     add(t0, haystack, hlen_neg);
1121     (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
1122     beq(first, ch2, STR1_LOOP);
1123 
1124     bind(STR2_NEXT);
1125     add(hlen_neg, hlen_neg, haystack_chr_size);
1126     blez(hlen_neg, FIRST_LOOP);
1127     j(NOMATCH);
1128 
1129     bind(STR1_LOOP);
1130     add(nlen_tmp, nlen_neg, needle_chr_size);
1131     add(hlen_tmp, hlen_neg, haystack_chr_size);
1132     bgez(nlen_tmp, MATCH);
1133 
1134     bind(STR1_NEXT);
1135     add(ch1, needle, nlen_tmp);
1136     (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1137     add(ch2, haystack, hlen_tmp);
1138     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1139     bne(ch1, ch2, STR2_NEXT);
1140     add(nlen_tmp, nlen_tmp, needle_chr_size);
1141     add(hlen_tmp, hlen_tmp, haystack_chr_size);
1142     bltz(nlen_tmp, STR1_NEXT);
1143     j(MATCH);
1144 
1145     bind(DOSHORT);
1146     if (needle_isL == haystack_isL) {
1147       sub(t0, needle_len, 2);
1148       bltz(t0, DO1);
1149       bgtz(t0, DO3);
1150     }
1151   }
1152 
1153   if (needle_con_cnt == 4) {
1154     Label CH1_LOOP;
1155     (this->*load_4chr)(ch1, Address(needle), noreg);
1156     sub(result_tmp, haystack_len, 4);
1157     slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
1158     add(haystack, haystack, tmp3);
1159     neg(hlen_neg, tmp3);
1160     if (AvoidUnalignedAccesses) {
      // Preload the first value; then each loop iteration reads just one character instead of
      // four, shifting the previous ch2 right by the size of a character in bits.
1163       add(tmp3, haystack, hlen_neg);
1164       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1165       if (isLL) {
        // need to erase the most significant byte in the 32-bit value of ch2
1167         slli(ch2, ch2, 40);
1168         srli(ch2, ch2, 32);
1169       } else {
1170         slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
1171       }
1172     }
1173 
1174     bind(CH1_LOOP);
1175     add(tmp3, haystack, hlen_neg);
1176     if (AvoidUnalignedAccesses) {
1177       srli(ch2, ch2, isLL ? 8 : 16);
1178       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
1179       slli(tmp3, tmp3, isLL ? 24 : 48);
1180       add(ch2, ch2, tmp3);
1181     } else {
1182       (this->*load_4chr)(ch2, Address(tmp3), noreg);
1183     }
1184     beq(ch1, ch2, MATCH);
1185     add(hlen_neg, hlen_neg, haystack_chr_size);
1186     blez(hlen_neg, CH1_LOOP);
1187     j(NOMATCH);
1188   }
1189 
1190   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
1191     Label CH1_LOOP;
1192     BLOCK_COMMENT("string_indexof DO2 {");
1193     bind(DO2);
1194     (this->*load_2chr)(ch1, Address(needle), noreg);
1195     if (needle_con_cnt == 2) {
1196       sub(result_tmp, haystack_len, 2);
1197     }
1198     slli(tmp3, result_tmp, haystack_chr_shift);
1199     add(haystack, haystack, tmp3);
1200     neg(hlen_neg, tmp3);
1201     if (AvoidUnalignedAccesses) {
      // Preload the first value; then each loop iteration reads just one character instead of
      // two, shifting the previous ch2 right by the size of a character in bits.
1204       add(tmp3, haystack, hlen_neg);
1205       (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1206       slli(ch2, ch2, isLL ? 8 : 16);
1207     }
1208     bind(CH1_LOOP);
1209     add(tmp3, haystack, hlen_neg);
1210     if (AvoidUnalignedAccesses) {
1211       srli(ch2, ch2, isLL ? 8 : 16);
1212       (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
1213       slli(tmp3, tmp3, isLL ? 8 : 16);
1214       add(ch2, ch2, tmp3);
1215     } else {
1216       (this->*load_2chr)(ch2, Address(tmp3), noreg);
1217     }
1218     beq(ch1, ch2, MATCH);
1219     add(hlen_neg, hlen_neg, haystack_chr_size);
1220     blez(hlen_neg, CH1_LOOP);
1221     j(NOMATCH);
1222     BLOCK_COMMENT("} string_indexof DO2");
1223   }
1224 
1225   if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
1226     Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1227     BLOCK_COMMENT("string_indexof DO3 {");
1228 
1229     bind(DO3);
1230     (this->*load_2chr)(first, Address(needle), noreg);
1231     (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
1232     if (needle_con_cnt == 3) {
1233       sub(result_tmp, haystack_len, 3);
1234     }
1235     slli(hlen_tmp, result_tmp, haystack_chr_shift);
1236     add(haystack, haystack, hlen_tmp);
1237     neg(hlen_neg, hlen_tmp);
1238 
1239     bind(FIRST_LOOP);
1240     add(ch2, haystack, hlen_neg);
1241     if (AvoidUnalignedAccesses) {
1242       (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
1243       (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1244       slli(tmp2, tmp2, isLL ? 8 : 16);
1245       add(ch2, ch2, tmp2);
1246     } else {
1247       (this->*load_2chr)(ch2, Address(ch2), noreg);
1248     }
1249     beq(first, ch2, STR1_LOOP);
1250 
1251     bind(STR2_NEXT);
1252     add(hlen_neg, hlen_neg, haystack_chr_size);
1253     blez(hlen_neg, FIRST_LOOP);
1254     j(NOMATCH);
1255 
1256     bind(STR1_LOOP);
1257     add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
1258     add(ch2, haystack, hlen_tmp);
1259     (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1260     bne(ch1, ch2, STR2_NEXT);
1261     j(MATCH);
1262     BLOCK_COMMENT("} string_indexof DO3");
1263   }
1264 
1265   if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1266     Label DO1_LOOP;
1267 
1268     BLOCK_COMMENT("string_indexof DO1 {");
1269     bind(DO1);
1270     (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1271     sub(result_tmp, haystack_len, 1);
1272     slli(tmp3, result_tmp, haystack_chr_shift);
1273     add(haystack, haystack, tmp3);
1274     neg(hlen_neg, tmp3);
1275 
1276     bind(DO1_LOOP);
1277     add(tmp3, haystack, hlen_neg);
1278     (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1279     beq(ch1, ch2, MATCH);
1280     add(hlen_neg, hlen_neg, haystack_chr_size);
1281     blez(hlen_neg, DO1_LOOP);
1282     BLOCK_COMMENT("} string_indexof DO1");
1283   }
1284 
1285   bind(NOMATCH);
1286   mv(result, -1);
1287   j(DONE);
1288 
1289   bind(MATCH);
1290   srai(t0, hlen_neg, haystack_chr_shift);
1291   add(result, result_tmp, t0);
1292 
1293   bind(DONE);
1294 }
1295 
1296 // Compare strings.
1297 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1298                                     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1299                                     Register tmp3, int ae)
1300 {
1301   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1302       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1303       SHORT_LOOP_START, TAIL_CHECK, L;
1304 
1305   const int STUB_THRESHOLD = 64 + 8;
1306   bool isLL = ae == StrIntrinsicNode::LL;
1307   bool isLU = ae == StrIntrinsicNode::LU;
1308   bool isUL = ae == StrIntrinsicNode::UL;
1309 
1310   bool str1_isL = isLL || isLU;
1311   bool str2_isL = isLL || isUL;
1312 
1313   // for L strings, 1 byte for 1 character
1314   // for U strings, 2 bytes for 1 character
1315   int str1_chr_size = str1_isL ? 1 : 2;
1316   int str2_chr_size = str2_isL ? 1 : 2;
1317   int minCharsInWord = isLL ? wordSize : wordSize / 2;
1318 
1319   load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1320   load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1321 
1322   BLOCK_COMMENT("string_compare {");
1323 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
1326   if (!str1_isL) {
1327     sraiw(cnt1, cnt1, 1);
1328   }
1329   if (!str2_isL) {
1330     sraiw(cnt2, cnt2, 1);
1331   }
1332 
1333   // Compute the minimum of the string lengths and save the difference in result.
1334   sub(result, cnt1, cnt2);
1335   bgt(cnt1, cnt2, L);
1336   mv(cnt2, cnt1);
1337   bind(L);
1338 
1339   // A very short string
1340   mv(t0, minCharsInWord);
1341   ble(cnt2, t0, SHORT_STRING);
1342 
1343   // Compare longwords
1344   // load first parts of strings and finish initialization while loading
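  // The setup below advances str1/str2 past the region still to be compared and turns the
  // counters into negative byte offsets that are stepped up towards zero, so the main loop
  // can address both strings with a single add and detect the end with a sign test.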
1345   {
1346     if (str1_isL == str2_isL) { // LL or UU
      // check if str1 and str2 are the same pointer
1348       beq(str1, str2, DONE);
1349       // load 8 bytes once to compare
1350       ld(tmp1, Address(str1));
1351       ld(tmp2, Address(str2));
1352       mv(t0, STUB_THRESHOLD);
1353       bge(cnt2, t0, STUB);
1354       sub(cnt2, cnt2, minCharsInWord);
1355       beqz(cnt2, TAIL_CHECK);
1356       // convert cnt2 from characters to bytes
1357       if (!str1_isL) {
1358         slli(cnt2, cnt2, 1);
1359       }
1360       add(str2, str2, cnt2);
1361       add(str1, str1, cnt2);
1362       sub(cnt2, zr, cnt2);
1363     } else if (isLU) { // LU case
1364       lwu(tmp1, Address(str1));
1365       ld(tmp2, Address(str2));
1366       mv(t0, STUB_THRESHOLD);
1367       bge(cnt2, t0, STUB);
1368       addi(cnt2, cnt2, -4);
1369       add(str1, str1, cnt2);
1370       sub(cnt1, zr, cnt2);
1371       slli(cnt2, cnt2, 1);
1372       add(str2, str2, cnt2);
1373       inflate_lo32(tmp3, tmp1);
1374       mv(tmp1, tmp3);
1375       sub(cnt2, zr, cnt2);
1376       addi(cnt1, cnt1, 4);
1377     } else { // UL case
1378       ld(tmp1, Address(str1));
1379       lwu(tmp2, Address(str2));
1380       mv(t0, STUB_THRESHOLD);
1381       bge(cnt2, t0, STUB);
1382       addi(cnt2, cnt2, -4);
1383       slli(t0, cnt2, 1);
1384       sub(cnt1, zr, t0);
1385       add(str1, str1, t0);
1386       add(str2, str2, cnt2);
1387       inflate_lo32(tmp3, tmp2);
1388       mv(tmp2, tmp3);
1389       sub(cnt2, zr, cnt2);
1390       addi(cnt1, cnt1, 8);
1391     }
1392     addi(cnt2, cnt2, isUL ? 4 : 8);
1393     bne(tmp1, tmp2, DIFFERENCE);
1394     bgez(cnt2, TAIL);
1395 
1396     // main loop
1397     bind(NEXT_WORD);
1398     if (str1_isL == str2_isL) { // LL or UU
1399       add(t0, str1, cnt2);
1400       ld(tmp1, Address(t0));
1401       add(t0, str2, cnt2);
1402       ld(tmp2, Address(t0));
1403       addi(cnt2, cnt2, 8);
1404     } else if (isLU) { // LU case
1405       add(t0, str1, cnt1);
1406       lwu(tmp1, Address(t0));
1407       add(t0, str2, cnt2);
1408       ld(tmp2, Address(t0));
1409       addi(cnt1, cnt1, 4);
1410       inflate_lo32(tmp3, tmp1);
1411       mv(tmp1, tmp3);
1412       addi(cnt2, cnt2, 8);
1413     } else { // UL case
1414       add(t0, str2, cnt2);
1415       lwu(tmp2, Address(t0));
1416       add(t0, str1, cnt1);
1417       ld(tmp1, Address(t0));
1418       inflate_lo32(tmp3, tmp2);
1419       mv(tmp2, tmp3);
1420       addi(cnt1, cnt1, 8);
1421       addi(cnt2, cnt2, 4);
1422     }
1423     bne(tmp1, tmp2, DIFFERENCE);
1424     bltz(cnt2, NEXT_WORD);
1425     bind(TAIL);
1426     if (str1_isL == str2_isL) { // LL or UU
1427       load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1428       load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1429     } else if (isLU) { // LU case
1430       load_int_misaligned(tmp1, Address(str1), tmp3, false);
1431       load_long_misaligned(tmp2, Address(str2), tmp3, 2);
1432       inflate_lo32(tmp3, tmp1);
1433       mv(tmp1, tmp3);
1434     } else { // UL case
1435       load_int_misaligned(tmp2, Address(str2), tmp3, false);
1436       load_long_misaligned(tmp1, Address(str1), tmp3, 2);
1437       inflate_lo32(tmp3, tmp2);
1438       mv(tmp2, tmp3);
1439     }
1440     bind(TAIL_CHECK);
1441     beq(tmp1, tmp2, DONE);
1442 
1443     // Find the first different characters in the longwords and
1444     // compute their difference.
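    // Roughly: shift = count-trailing-zeros(tmp1 ^ tmp2) rounded down to a character
    // boundary; result = (int)((tmp1 >> shift) & char_mask) - (int)((tmp2 >> shift) & char_mask),
    // where char_mask is 0xFF for Latin1 and 0xFFFF for UTF-16.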
1445     bind(DIFFERENCE);
1446     xorr(tmp3, tmp1, tmp2);
1447     ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
1448     srl(tmp1, tmp1, result);
1449     srl(tmp2, tmp2, result);
1450     if (isLL) {
1451       andi(tmp1, tmp1, 0xFF);
1452       andi(tmp2, tmp2, 0xFF);
1453     } else {
1454       andi(tmp1, tmp1, 0xFFFF);
1455       andi(tmp2, tmp2, 0xFFFF);
1456     }
1457     sub(result, tmp1, tmp2);
1458     j(DONE);
1459   }
1460 
1461   bind(STUB);
1462   RuntimeAddress stub = nullptr;
1463   switch (ae) {
1464     case StrIntrinsicNode::LL:
1465       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1466       break;
1467     case StrIntrinsicNode::UU:
1468       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1469       break;
1470     case StrIntrinsicNode::LU:
1471       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1472       break;
1473     case StrIntrinsicNode::UL:
1474       stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1475       break;
1476     default:
1477       ShouldNotReachHere();
1478   }
1479   assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1480   address call = trampoline_call(stub);
1481   if (call == nullptr) {
1482     DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1483     ciEnv::current()->record_failure("CodeCache is full");
1484     return;
1485   }
1486   j(DONE);
1487 
1488   bind(SHORT_STRING);
1489   // Is the minimum length zero?
1490   beqz(cnt2, DONE);
1491   // Arrange the code so that most branches are taken while characters are being
1492   // loaded, and the next characters are loaded while the previous ones are compared.
1493   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1494   addi(str1, str1, str1_chr_size);
1495   addi(cnt2, cnt2, -1);
1496   beqz(cnt2, SHORT_LAST_INIT);
1497   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1498   addi(str2, str2, str2_chr_size);
1499   j(SHORT_LOOP_START);
1500   bind(SHORT_LOOP);
1501   addi(cnt2, cnt2, -1);
1502   beqz(cnt2, SHORT_LAST);
1503   bind(SHORT_LOOP_START);
1504   (this->*str1_load_chr)(tmp2, Address(str1), t0);
1505   addi(str1, str1, str1_chr_size);
1506   (this->*str2_load_chr)(t0, Address(str2), t0);
1507   addi(str2, str2, str2_chr_size);
1508   bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1509   addi(cnt2, cnt2, -1);
1510   beqz(cnt2, SHORT_LAST2);
1511   (this->*str1_load_chr)(tmp1, Address(str1), t0);
1512   addi(str1, str1, str1_chr_size);
1513   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1514   addi(str2, str2, str2_chr_size);
1515   beq(tmp2, t0, SHORT_LOOP);
1516   sub(result, tmp2, t0);
1517   j(DONE);
1518   bind(SHORT_LOOP_TAIL);
1519   sub(result, tmp1, cnt1);
1520   j(DONE);
1521   bind(SHORT_LAST2);
1522   beq(tmp2, t0, DONE);
1523   sub(result, tmp2, t0);
1524 
1525   j(DONE);
1526   bind(SHORT_LAST_INIT);
1527   (this->*str2_load_chr)(cnt1, Address(str2), t0);
1528   addi(str2, str2, str2_chr_size);
1529   bind(SHORT_LAST);
1530   beq(tmp1, cnt1, DONE);
1531   sub(result, tmp1, cnt1);
1532 
1533   bind(DONE);
1534 
1535   BLOCK_COMMENT("} string_compare");
1536 }
1537 
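// Compare two byte[] or char[] arrays for equality.  Roughly (illustrative only):
//   if (a1 == a2)                        return true;
//   if (a1 == nullptr || a2 == nullptr)  return false;
//   if (a1.length != a2.length)          return false;
//   compare the payloads two 64-bit words per loop iteration, masking off any
//   bytes that lie past the logical end of the final word; return the outcome.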
1538 void C2_MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
1539                                       Register tmp4, Register tmp5, Register tmp6, Register result,
1540                                       Register cnt1, int elem_size) {
1541   Label DONE, SAME, NEXT_DWORD, SHORT, TAIL, TAIL2, IS_TMP5_ZR;
1542   Register tmp1 = t0;
1543   Register tmp2 = t1;
1544   Register cnt2 = tmp2;  // cnt2 only used in array length compare
1545   Register elem_per_word = tmp6;
1546   int log_elem_size = exact_log2(elem_size);
1547   int length_offset = arrayOopDesc::length_offset_in_bytes();
1548   int base_offset   = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1549 
1550   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1551   assert_different_registers(a1, a2, result, cnt1, t0, t1, tmp3, tmp4, tmp5, tmp6);
1552   mv(elem_per_word, wordSize / elem_size);
1553 
1554   BLOCK_COMMENT("arrays_equals {");
1555 
1556   // if (a1 == a2), return true
1557   beq(a1, a2, SAME);
1558 
1559   mv(result, false);
1560   beqz(a1, DONE);
1561   beqz(a2, DONE);
1562   lwu(cnt1, Address(a1, length_offset));
1563   lwu(cnt2, Address(a2, length_offset));
1564   bne(cnt2, cnt1, DONE);
1565   beqz(cnt1, SAME);
1566 
1567   slli(tmp5, cnt1, 3 + log_elem_size);
1568   sub(tmp5, zr, tmp5);
1569   add(a1, a1, base_offset);
1570   add(a2, a2, base_offset);
1571   ld(tmp3, Address(a1, 0));
1572   ld(tmp4, Address(a2, 0));
1573   ble(cnt1, elem_per_word, SHORT); // short or same
1574 
1575   // Main 16 byte comparison loop with 2 exits
1576   bind(NEXT_DWORD); {
1577     ld(tmp1, Address(a1, wordSize));
1578     ld(tmp2, Address(a2, wordSize));
1579     sub(cnt1, cnt1, 2 * wordSize / elem_size);
1580     blez(cnt1, TAIL);
1581     bne(tmp3, tmp4, DONE);
1582     ld(tmp3, Address(a1, 2 * wordSize));
1583     ld(tmp4, Address(a2, 2 * wordSize));
1584     add(a1, a1, 2 * wordSize);
1585     add(a2, a2, 2 * wordSize);
1586     ble(cnt1, elem_per_word, TAIL2);
1587   } beq(tmp1, tmp2, NEXT_DWORD);
1588   j(DONE);
1589 
1590   bind(TAIL);
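  // tmp5 still holds the negated array length in bits.  sll takes its shift amount
  // modulo 64, so shifting the xor of the last (possibly only partly valid) word
  // left by tmp5 pushes any bytes beyond the logical end of the arrays out of the
  // register; only genuine differences within the arrays survive.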
1591   xorr(tmp4, tmp3, tmp4);
1592   xorr(tmp2, tmp1, tmp2);
1593   sll(tmp2, tmp2, tmp5);
1594   orr(tmp5, tmp4, tmp2);
1595   j(IS_TMP5_ZR);
1596 
1597   bind(TAIL2);
1598   bne(tmp1, tmp2, DONE);
1599 
1600   bind(SHORT);
1601   xorr(tmp4, tmp3, tmp4);
1602   sll(tmp5, tmp4, tmp5);
1603 
1604   bind(IS_TMP5_ZR);
1605   bnez(tmp5, DONE);
1606 
1607   bind(SAME);
1608   mv(result, true);
1609   // That's it.
1610   bind(DONE);
1611 
1612   BLOCK_COMMENT("} arrays_equals");
1613 }
1614 
1615 // Compare Strings for equality.
1616 
1617 // For Strings we're passed the address of the first characters in a1
1618 // and a2, and the length in bytes in cnt1.
1619 // elem_size is the element size in bytes: either 1 or 2.
1620 // There are two implementations.  For arrays >= 8 bytes, all
1621 // comparisons (on hardware supporting unaligned accesses, including the
1622 // final one, which may overlap bytes already compared) are performed
1623 // 8 bytes at a time.  For strings < 8 bytes (and for the tails of long
1624 // strings when AvoidUnalignedAccesses is true), we compare a word, then
1625 // a halfword, and then a byte.
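//
// Roughly (illustrative only): while at least 8 bytes remain, compare a1 and a2 one
// 64-bit word at a time; on the short/tail path, bits 2, 1 and 0 of the remaining
// byte count select a 4-byte, a 2-byte and (for 1-byte elements only) a 1-byte compare.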
1626 
1627 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1628                                       Register result, Register cnt1, int elem_size)
1629 {
1630   Label SAME, DONE, SHORT, NEXT_WORD;
1631   Register tmp1 = t0;
1632   Register tmp2 = t1;
1633 
1634   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
1635   assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1636 
1637   BLOCK_COMMENT("string_equals {");
1638 
1639   beqz(cnt1, SAME);
1640   mv(result, false);
1641 
1642   // Check for short strings, i.e. smaller than wordSize.
1643   sub(cnt1, cnt1, wordSize);
1644   bltz(cnt1, SHORT);
1645 
1646   // Main 8 byte comparison loop.
1647   bind(NEXT_WORD); {
1648     ld(tmp1, Address(a1, 0));
1649     add(a1, a1, wordSize);
1650     ld(tmp2, Address(a2, 0));
1651     add(a2, a2, wordSize);
1652     sub(cnt1, cnt1, wordSize);
1653     bne(tmp1, tmp2, DONE);
1654   } bgez(cnt1, NEXT_WORD);
1655 
1656   if (!AvoidUnalignedAccesses) {
1657     // Last longword.  When the length is a multiple of 8 bytes we compare
1658     // the last longword twice, but that's still faster than another
1659     // conditional branch.
1660     // cnt1 (in bytes) is now in [-8, -1]; it is -8 exactly when the
1661     // length is a multiple of 8.
1662     add(tmp1, a1, cnt1);
1663     ld(tmp1, Address(tmp1, 0));
1664     add(tmp2, a2, cnt1);
1665     ld(tmp2, Address(tmp2, 0));
1666     bne(tmp1, tmp2, DONE);
1667     j(SAME);
1668   } else {
1669     add(tmp1, cnt1, wordSize);
1670     beqz(tmp1, SAME);
1671   }
1672 
1673   bind(SHORT);
1674   Label TAIL03, TAIL01;
1675 
1676   // 0-7 bytes left.
1677   test_bit(tmp1, cnt1, 2);
1678   beqz(tmp1, TAIL03);
1679   {
1680     lwu(tmp1, Address(a1, 0));
1681     add(a1, a1, 4);
1682     lwu(tmp2, Address(a2, 0));
1683     add(a2, a2, 4);
1684     bne(tmp1, tmp2, DONE);
1685   }
1686 
1687   bind(TAIL03);
1688   // 0-3 bytes left.
1689   test_bit(tmp1, cnt1, 1);
1690   beqz(tmp1, TAIL01);
1691   {
1692     lhu(tmp1, Address(a1, 0));
1693     add(a1, a1, 2);
1694     lhu(tmp2, Address(a2, 0));
1695     add(a2, a2, 2);
1696     bne(tmp1, tmp2, DONE);
1697   }
1698 
1699   bind(TAIL01);
1700   if (elem_size == 1) { // Only needed when comparing 1-byte elements
1701     // 0-1 bytes left.
1702     test_bit(tmp1, cnt1, 0);
1703     beqz(tmp1, SAME);
1704     {
1705       lbu(tmp1, Address(a1, 0));
1706       lbu(tmp2, Address(a2, 0));
1707       bne(tmp1, tmp2, DONE);
1708     }
1709   }
1710 
1711   // Strings are equal.
1712   bind(SAME);
1713   mv(result, true);
1714 
1715   // That's it.
1716   bind(DONE);
1717   BLOCK_COMMENT("} string_equals");
1718 }
1719 
1720 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1721 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1722                                                               bool is_far, bool is_unordered);
1723 
1724 static conditional_branch_insn conditional_branches[] =
1725 {
1726   /* SHORT branches */
1727   (conditional_branch_insn)&MacroAssembler::beq,
1728   (conditional_branch_insn)&MacroAssembler::bgt,
1729   nullptr, // BoolTest::overflow
1730   (conditional_branch_insn)&MacroAssembler::blt,
1731   (conditional_branch_insn)&MacroAssembler::bne,
1732   (conditional_branch_insn)&MacroAssembler::ble,
1733   nullptr, // BoolTest::no_overflow
1734   (conditional_branch_insn)&MacroAssembler::bge,
1735 
1736   /* UNSIGNED branches */
1737   (conditional_branch_insn)&MacroAssembler::beq,
1738   (conditional_branch_insn)&MacroAssembler::bgtu,
1739   nullptr,
1740   (conditional_branch_insn)&MacroAssembler::bltu,
1741   (conditional_branch_insn)&MacroAssembler::bne,
1742   (conditional_branch_insn)&MacroAssembler::bleu,
1743   nullptr,
1744   (conditional_branch_insn)&MacroAssembler::bgeu
1745 };
1746 
1747 static float_conditional_branch_insn float_conditional_branches[] =
1748 {
1749   /* FLOAT SHORT branches */
1750   (float_conditional_branch_insn)&MacroAssembler::float_beq,
1751   (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1752   nullptr,  // BoolTest::overflow
1753   (float_conditional_branch_insn)&MacroAssembler::float_blt,
1754   (float_conditional_branch_insn)&MacroAssembler::float_bne,
1755   (float_conditional_branch_insn)&MacroAssembler::float_ble,
1756   nullptr, // BoolTest::no_overflow
1757   (float_conditional_branch_insn)&MacroAssembler::float_bge,
1758 
1759   /* DOUBLE SHORT branches */
1760   (float_conditional_branch_insn)&MacroAssembler::double_beq,
1761   (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1762   nullptr,
1763   (float_conditional_branch_insn)&MacroAssembler::double_blt,
1764   (float_conditional_branch_insn)&MacroAssembler::double_bne,
1765   (float_conditional_branch_insn)&MacroAssembler::double_ble,
1766   nullptr,
1767   (float_conditional_branch_insn)&MacroAssembler::double_bge
1768 };
1769 
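// Both tables are indexed by the BoolTest condition code in the low bits
// (eq, gt, overflow, lt, ne, le, no_overflow, ge); entries 8..15 hold the
// unsigned variants (integer table) or the double-precision variants (FP table),
// selected by the extra mask bit carried in cmpFlag (see double_branch_mask in
// float_cmp_branch below).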
1770 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1771   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1772          "invalid conditional branch index");
1773   (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1774 }
1775 
1776 // This function should only be used by C2. Instead of unordered-greater, C2 uses
1777 // unordered-less (flipping the unordered flag) and commutes the result bits in do_one_bytecode().
1778 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1779   assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1780          "invalid float conditional branch index");
1781   int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1782   (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1783     (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1784 }
1785 
1786 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1787   switch (cmpFlag) {
1788     case BoolTest::eq:
1789     case BoolTest::le:
1790       beqz(op1, L, is_far);
1791       break;
1792     case BoolTest::ne:
1793     case BoolTest::gt:
1794       bnez(op1, L, is_far);
1795       break;
1796     default:
1797       ShouldNotReachHere();
1798   }
1799 }
1800 
1801 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1802   switch (cmpFlag) {
1803     case BoolTest::eq:
1804       beqz(op1, L, is_far);
1805       break;
1806     case BoolTest::ne:
1807       bnez(op1, L, is_far);
1808       break;
1809     default:
1810       ShouldNotReachHere();
1811   }
1812 }
1813 
1814 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
1815   Label L;
1816   cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
1817   mv(dst, src);
1818   bind(L);
1819 }
1820 
1821 // Set dst to NaN if any NaN input.
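// RISC-V fmin/fmax return the non-NaN operand when only one input is NaN, whereas
// Java Math.min/max must return NaN.  Roughly (illustrative only):
//   if (isNaN(src1) || isNaN(src2))  dst = src1 + src2;  // fadd propagates a NaN
//   else                             dst = is_min ? fmin(src1, src2) : fmax(src1, src2);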
1822 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
1823                                   bool is_double, bool is_min) {
1824   assert_different_registers(dst, src1, src2);
1825 
1826   Label Done, Compare;
1827 
1828   is_double ? fclass_d(t0, src1)
1829             : fclass_s(t0, src1);
1830   is_double ? fclass_d(t1, src2)
1831             : fclass_s(t1, src2);
1832   orr(t0, t0, t1);
1833   andi(t0, t0, 0b1100000000); // set if src1 or src2 is a signaling or quiet NaN (fclass bits 8 and 9)
1834   beqz(t0, Compare);
1835   is_double ? fadd_d(dst, src1, src2)
1836             : fadd_s(dst, src1, src2);
1837   j(Done);
1838 
1839   bind(Compare);
1840   if (is_double) {
1841     is_min ? fmin_d(dst, src1, src2)
1842            : fmax_d(dst, src1, src2);
1843   } else {
1844     is_min ? fmin_s(dst, src1, src2)
1845            : fmax_s(dst, src1, src2);
1846   }
1847 
1848   bind(Done);
1849 }
1850 
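// Strip-mined element-wise comparison shared by the vectorized equals/compare
// intrinsics below.  Roughly (illustrative only):
//   while (cnt > 0) {
//     vl = vsetvli(cnt);                         // elements handled in this pass
//     if (any of the vl element pairs differ)    // vmsne_vv + vfirst_m
//       branch to DONE with tmp2 = index of the first mismatch in this pass;
//     cnt -= vl; a1 += vl * esize; a2 += vl * esize;
//   }
//   result = true;                               // fall through: all elements equal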
1851 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
1852                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
1853   Label loop;
1854   Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
1855 
1856   bind(loop);
1857   vsetvli(tmp1, cnt, sew, Assembler::m2);
1858   vlex_v(vr1, a1, sew);
1859   vlex_v(vr2, a2, sew);
1860   vmsne_vv(vrs, vr1, vr2);
1861   vfirst_m(tmp2, vrs);
1862   bgez(tmp2, DONE);
1863   sub(cnt, cnt, tmp1);
1864   if (!islatin) {
1865     slli(tmp1, tmp1, 1); // get byte counts
1866   }
1867   add(a1, a1, tmp1);
1868   add(a2, a2, tmp1);
1869   bnez(cnt, loop);
1870 
1871   mv(result, true);
1872 }
1873 
1874 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt, int elem_size) {
1875   Label DONE;
1876   Register tmp1 = t0;
1877   Register tmp2 = t1;
1878 
1879   BLOCK_COMMENT("string_equals_v {");
1880 
1881   mv(result, false);
1882 
1883   if (elem_size == 2) {
1884     srli(cnt, cnt, 1);
1885   }
1886 
1887   element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);
1888 
1889   bind(DONE);
1890   BLOCK_COMMENT("} string_equals_v");
1891 }
1892 
1893 // Used by C2 ClearArray patterns.
1894 // base: Address of a buffer to be zeroed
1895 // cnt: Count in HeapWords
1896 //
1897 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
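// Roughly (illustrative only):
//   while (cnt > 0) {
//     vl = vsetvli(cnt);              // words handled in this pass (e64, m4 -> v4..v7)
//     store vl zero words at base;    // vse64_v of the zeroed register group
//     cnt -= vl; base += vl * wordSize;
//   }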
1898 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
1899   Label loop;
1900 
1901   // Materialize a register group of zero words (v4..v7).
1902   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
1903   vxor_vv(v4, v4, v4);
1904 
1905   bind(loop);
1906   vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
1907   vse64_v(v4, base);
1908   sub(cnt, cnt, t0);
1909   shadd(base, t0, base, t0, 3);
1910   bnez(cnt, loop);
1911 }
1912 
1913 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
1914                                         Register cnt1, int elem_size) {
1915   Label DONE;
1916   Register tmp1 = t0;
1917   Register tmp2 = t1;
1918   Register cnt2 = tmp2;
1919   int length_offset = arrayOopDesc::length_offset_in_bytes();
1920   int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1921 
1922   BLOCK_COMMENT("arrays_equals_v {");
1923 
1924   // if (a1 == a2), return true
1925   mv(result, true);
1926   beq(a1, a2, DONE);
1927 
1928   mv(result, false);
1929   // if a1 == null or a2 == null, return false
1930   beqz(a1, DONE);
1931   beqz(a2, DONE);
1932   // if (a1.length != a2.length), return false
1933   lwu(cnt1, Address(a1, length_offset));
1934   lwu(cnt2, Address(a2, length_offset));
1935   bne(cnt1, cnt2, DONE);
1936 
1937   la(a1, Address(a1, base_offset));
1938   la(a2, Address(a2, base_offset));
1939 
1940   element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);
1941 
1942   bind(DONE);
1943 
1944   BLOCK_COMMENT("} arrays_equals_v");
1945 }
1946 
1947 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
1948                                          Register result, Register tmp1, Register tmp2, int encForm) {
1949   Label DIFFERENCE, DONE, L, loop;
1950   bool encLL = encForm == StrIntrinsicNode::LL;
1951   bool encLU = encForm == StrIntrinsicNode::LU;
1952   bool encUL = encForm == StrIntrinsicNode::UL;
1953 
1954   bool str1_isL = encLL || encLU;
1955   bool str2_isL = encLL || encUL;
1956 
1957   int minCharsInWord = encLL ? wordSize : wordSize / 2;
1958 
1959   BLOCK_COMMENT("string_compare_v {");
1960 
1961   // for Latin strings, 1 byte for 1 character
1962   // for UTF16 strings, 2 bytes for 1 character
1963   if (!str1_isL)
1964     sraiw(cnt1, cnt1, 1);
1965   if (!str2_isL)
1966     sraiw(cnt2, cnt2, 1);
1967 
1968   // Set result to the length difference; it is what gets returned when the strings
1969   // are equal up to the shorter length. Save the minimum of the lengths in cnt2.
1970   sub(result, cnt1, cnt2);
1971   bgt(cnt1, cnt2, L);
1972   mv(cnt2, cnt1);
1973   bind(L);
1974 
1975   if (str1_isL == str2_isL) { // LL or UU
1976     element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE);
1977     j(DONE);
1978   } else { // LU or UL
1979     Register strL = encLU ? str1 : str2;
1980     Register strU = encLU ? str2 : str1;
1981     VectorRegister vstr1 = encLU ? v8 : v4;
1982     VectorRegister vstr2 = encLU ? v4 : v8;
1983 
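    // Strip-mined loop: each pass loads vl Latin-1 bytes from strL, zero-extends them
    // to 16-bit chars (vzext_vf2), loads vl UTF-16 chars from strU and compares them;
    // on a mismatch, tmp2 holds the index of the first difference within this pass.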
1984     bind(loop);
1985     vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
1986     vle8_v(vstr1, strL);
1987     vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
1988     vzext_vf2(vstr2, vstr1);
1989     vle16_v(vstr1, strU);
1990     vmsne_vv(v4, vstr2, vstr1);
1991     vfirst_m(tmp2, v4);
1992     bgez(tmp2, DIFFERENCE);
1993     sub(cnt2, cnt2, tmp1);
1994     add(strL, strL, tmp1);
1995     shadd(strU, tmp1, strU, tmp1, 1);
1996     bnez(cnt2, loop);
1997     j(DONE);
1998   }
1999 
2000   bind(DIFFERENCE);
2001   slli(tmp1, tmp2, 1);
2002   add(str1, str1, str1_isL ? tmp2 : tmp1);
2003   add(str2, str2, str2_isL ? tmp2 : tmp1);
2004   str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2005   str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2006   sub(result, tmp1, tmp2);
2007 
2008   bind(DONE);
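  BLOCK_COMMENT("} string_compare_v");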
2009 }
2010 
2011 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2012   Label loop;
2013   assert_different_registers(src, dst, len, tmp, t0);
2014 
2015   BLOCK_COMMENT("byte_array_inflate_v {");
2016   bind(loop);
2017   vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2018   vle8_v(v6, src);
2019   vsetvli(t0, len, Assembler::e16, Assembler::m4);
2020   vzext_vf2(v4, v6);
2021   vse16_v(v4, dst);
2022   sub(len, len, tmp);
2023   add(src, src, tmp);
2024   shadd(dst, tmp, dst, tmp, 1);
2025   bnez(len, loop);
2026   BLOCK_COMMENT("} byte_array_inflate_v");
2027 }
2028 
2029 // Compress char[] array to byte[].
2030 // result: the array length if every element in the array can be encoded; otherwise 0.
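// encode_iso_array_v decrements 'len' by the number of characters it copies, so 'len'
// is zero afterwards only when the whole array was compressed; any remainder means a
// character was out of range and the result is forced to zero.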
2031 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2032                                               Register result, Register tmp) {
2033   Label done;
2034   encode_iso_array_v(src, dst, len, result, tmp, false);
2035   beqz(len, done);
2036   mv(result, zr);
2037   bind(done);
2038 }
2039 
2040 // Intrinsic for
2041 //
2042 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
2043 //     return the number of characters copied.
2044 // - java/lang/StringUTF16.compress
2045 //     return zero (0) if copy fails, otherwise 'len'.
2046 //
2047 // This version always returns the number of characters copied. A successful
2048 // copy will complete with the post-condition: 'res' == 'len', while an
2049 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2050 //
2051 // Clobbers: src, dst, len, result, t0
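//
// Roughly equivalent scalar logic (illustrative only; 'limit' is 0x7f when 'ascii'
// is true and 0xff otherwise):
//   int res = 0;
//   while (res < len) {
//     jchar c = src[res];
//     if (c > limit) break;
//     dst[res++] = (jbyte) c;
//   }
//   return res;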
2052 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2053                                            Register result, Register tmp, bool ascii) {
2054   Label loop, fail, done;
2055 
2056   BLOCK_COMMENT("encode_iso_array_v {");
2057   mv(result, 0);
2058 
2059   bind(loop);
2060   mv(tmp, ascii ? 0x7f : 0xff);
2061   vsetvli(t0, len, Assembler::e16, Assembler::m2);
2062   vle16_v(v2, src);
2063 
2064   vmsgtu_vx(v1, v2, tmp);
2065   vfirst_m(tmp, v1);
2066   vmsbf_m(v0, v1);
2067   // compress char to byte
2068   vsetvli(t0, len, Assembler::e8);
2069   vncvt_x_x_w(v1, v2, Assembler::v0_t);
2070   vse8_v(v1, dst, Assembler::v0_t);
2071 
2072   // fail if char > 0x7f/0xff
2073   bgez(tmp, fail);
2074   add(result, result, t0);
2075   add(dst, dst, t0);
2076   sub(len, len, t0);
2077   shadd(src, t0, src, t0, 1);
2078   bnez(len, loop);
2079   j(done);
2080 
2081   bind(fail);
2082   add(result, result, tmp);
2083 
2084   bind(done);
2085   BLOCK_COMMENT("} encode_iso_array_v");
2086 }
2087 
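// Count the leading non-negative bytes of 'ary' (i.e. the index of the first byte
// with its sign bit set); 'result' equals 'len' when no negative byte is found.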
2088 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2089   Label LOOP, SET_RESULT, DONE;
2090 
2091   BLOCK_COMMENT("count_positives_v {");
2092   assert_different_registers(ary, len, result, tmp);
2093 
2094   mv(result, zr);
2095 
2096   bind(LOOP);
2097   vsetvli(t0, len, Assembler::e8, Assembler::m4);
2098   vle8_v(v4, ary);
2099   vmslt_vx(v4, v4, zr);
2100   vfirst_m(tmp, v4);
2101   bgez(tmp, SET_RESULT);
2102   // if tmp == -1, all bytes are positive
2103   add(result, result, t0);
2104 
2105   sub(len, len, t0);
2106   add(ary, ary, t0);
2107   bnez(len, LOOP);
2108   j(DONE);
2109 
2110   // add remaining positive bytes count
2111   bind(SET_RESULT);
2112   add(result, result, tmp);
2113 
2114   bind(DONE);
2115   BLOCK_COMMENT("} count_positives_v");
2116 }
2117 
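// Find the first occurrence of the character 'ch' within the first cnt1 characters
// of str1 and return its index in 'result', or -1 if it does not occur.
// isL selects Latin-1 (one byte per char) versus UTF-16 (two bytes per char).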
2118 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2119                                               Register ch, Register result,
2120                                               Register tmp1, Register tmp2,
2121                                               bool isL) {
2122   mv(result, zr);
2123 
2124   Label loop, MATCH, DONE;
2125   Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2126   bind(loop);
2127   vsetvli(tmp1, cnt1, sew, Assembler::m4);
2128   vlex_v(v4, str1, sew);
2129   vmseq_vx(v4, v4, ch);
2130   vfirst_m(tmp2, v4);
2131   bgez(tmp2, MATCH); // if equal, return index
2132 
2133   add(result, result, tmp1);
2134   sub(cnt1, cnt1, tmp1);
2135   if (!isL) slli(tmp1, tmp1, 1);
2136   add(str1, str1, tmp1);
2137   bnez(cnt1, loop);
2138 
2139   mv(result, -1);
2140   j(DONE);
2141 
2142   bind(MATCH);
2143   add(result, result, tmp2);
2144 
2145   bind(DONE);
2146 }
2147 
2148 // Set dst to NaN if any NaN input.
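// As in the scalar minmax_fp above, vfmin/vfmax would return the non-NaN operand, so
// lanes in which src1 or src2 is NaN are overwritten afterwards with src + src under a
// vmfne mask, which propagates a NaN into exactly those lanes.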
2149 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2150                                     bool is_double, bool is_min, int vector_length) {
2151   assert_different_registers(dst, src1, src2);
2152 
2153   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2154 
2155   is_min ? vfmin_vv(dst, src1, src2)
2156          : vfmax_vv(dst, src1, src2);
2157 
2158   vmfne_vv(v0,  src1, src1);
2159   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2160   vmfne_vv(v0,  src2, src2);
2161   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2162 }
2163 
2164 // Set dst to NaN if any NaN input.
2165 // The destination vector register elements corresponding to masked-off elements
2166 // are handled with a mask-undisturbed policy.
2167 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2168                                            VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2169                                            bool is_double, bool is_min, int vector_length) {
2170   assert_different_registers(src1, src2, tmp1, tmp2);
2171   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2172 
2173   // Check vector elements of src1 and src2 for NaN.
2174   vmfeq_vv(tmp1, src1, src1);
2175   vmfeq_vv(tmp2, src2, src2);
2176 
2177   vmandn_mm(v0, vmask, tmp1);
2178   vfadd_vv(dst, src1, src1, Assembler::v0_t);
2179   vmandn_mm(v0, vmask, tmp2);
2180   vfadd_vv(dst, src2, src2, Assembler::v0_t);
2181 
2182   vmand_mm(tmp2, tmp1, tmp2);
2183   vmand_mm(v0, vmask, tmp2);
2184   is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
2185          : vfmax_vv(dst, src1, src2, Assembler::v0_t);
2186 }
2187 
2188 // Set dst to NaN if any NaN input.
2189 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
2190                                            FloatRegister src1, VectorRegister src2,
2191                                            VectorRegister tmp1, VectorRegister tmp2,
2192                                            bool is_double, bool is_min, int vector_length, VectorMask vm) {
2193   assert_different_registers(dst, src1);
2194   assert_different_registers(src2, tmp1, tmp2);
2195 
2196   Label L_done, L_NaN_1, L_NaN_2;
2197   // Set dst to src1 if src1 is NaN
2198   is_double ? feq_d(t0, src1, src1)
2199             : feq_s(t0, src1, src1);
2200   beqz(t0, L_NaN_2);
2201 
2202   vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2203   vfmv_s_f(tmp2, src1);
2204 
2205   is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
2206          : vfredmax_vs(tmp1, src2, tmp2, vm);
2207   vfmv_f_s(dst, tmp1);
2208 
2209   // Checking NaNs in src2
2210   vmfne_vv(tmp1, src2, src2, vm);
2211   vcpop_m(t0, tmp1, vm);
2212   beqz(t0, L_done);
2213 
2214   bind(L_NaN_1);
2215   vfredusum_vs(tmp1, src2, tmp2, vm);
2216   vfmv_f_s(dst, tmp1);
2217   j(L_done);
2218 
2219   bind(L_NaN_2);
2220   is_double ? fmv_d(dst, src1)
2221             : fmv_s(dst, src1);
2222   bind(L_done);
2223 }
2224 
2225 bool C2_MacroAssembler::in_scratch_emit_size() {
2226   if (ciEnv::current()->task() != nullptr) {
2227     PhaseOutput* phase_output = Compile::current()->output();
2228     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2229       return true;
2230     }
2231   }
2232   return MacroAssembler::in_scratch_emit_size();
2233 }
2234 
2235 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
2236                                           VectorRegister src2, VectorRegister tmp,
2237                                           int opc, BasicType bt, int vector_length, VectorMask vm) {
2238   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2239   vsetvli_helper(bt, vector_length);
2240   vmv_s_x(tmp, src1);
2241   switch (opc) {
2242     case Op_AddReductionVI:
2243     case Op_AddReductionVL:
2244       vredsum_vs(tmp, src2, tmp, vm);
2245       break;
2246     case Op_AndReductionV:
2247       vredand_vs(tmp, src2, tmp, vm);
2248       break;
2249     case Op_OrReductionV:
2250       vredor_vs(tmp, src2, tmp, vm);
2251       break;
2252     case Op_XorReductionV:
2253       vredxor_vs(tmp, src2, tmp, vm);
2254       break;
2255     case Op_MaxReductionV:
2256       vredmax_vs(tmp, src2, tmp, vm);
2257       break;
2258     case Op_MinReductionV:
2259       vredmin_vs(tmp, src2, tmp, vm);
2260       break;
2261     default:
2262       ShouldNotReachHere();
2263   }
2264   vmv_x_s(dst, tmp);
2265 }
2266 
2267 // Set vl and vtype for full and partial vector operations.
2268 // (vma = mu, vta = tu, vill = false)
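// A vector_length of at most 31 fits the 5-bit immediate of vsetivli; a length equal
// to the maximum for the element type uses vsetvli with rs1 = x0 (and rd != x0), which
// sets vl to VLMAX; any other length is first materialized in 'tmp'.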
2269 void C2_MacroAssembler::vsetvli_helper(BasicType bt, int vector_length, LMUL vlmul, Register tmp) {
2270   Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
2271   if (vector_length <= 31) {
2272     vsetivli(tmp, vector_length, sew, vlmul);
2273   } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
2274     vsetvli(tmp, x0, sew, vlmul);
2275   } else {
2276     mv(tmp, vector_length);
2277     vsetvli(tmp, tmp, sew, vlmul);
2278   }
2279 }
2280 
2281 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2282                                            int cond, BasicType bt, int vector_length, VectorMask vm) {
2283   assert(is_integral_type(bt), "unsupported element type");
2284   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2285   vsetvli_helper(bt, vector_length);
2286   vmclr_m(vd);
2287   switch (cond) {
2288     case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
2289     case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
2290     case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
2291     case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
2292     case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
2293     case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
2294     default:
2295       assert(false, "unsupported compare condition");
2296       ShouldNotReachHere();
2297   }
2298 }
2299 
2300 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2301                                      int cond, BasicType bt, int vector_length, VectorMask vm) {
2302   assert(is_floating_point_type(bt), "unsupported element type");
2303   assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2304   vsetvli_helper(bt, vector_length);
2305   vmclr_m(vd);
2306   switch (cond) {
2307     case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
2308     case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
2309     case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
2310     case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
2311     case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
2312     case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
2313     default:
2314       assert(false, "unsupported compare condition");
2315       ShouldNotReachHere();
2316   }
2317 }
2318 
2319 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, int vector_length,
2320                                          VectorRegister src, BasicType src_bt) {
2321   assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
2322   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2323   // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
2324   // Overlap of dst and src is only legal when the destination EEW is greater than the
2325   // source EEW, the source EMUL is at least 1, and the overlap lies in the highest-numbered
2326   // part of the destination register group; since LMUL = 1 here, dst and src cannot be the same.
2327   assert_different_registers(dst, src);
2328 
2329   vsetvli_helper(dst_bt, vector_length);
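  // vsext_vf2/vf4/vf8 sign-extend by a width factor of 2, 4 or 8 respectively, chosen
  // below from the dst/src element-size ratio.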
2330   if (src_bt == T_BYTE) {
2331     switch (dst_bt) {
2332     case T_SHORT:
2333       vsext_vf2(dst, src);
2334       break;
2335     case T_INT:
2336       vsext_vf4(dst, src);
2337       break;
2338     case T_LONG:
2339       vsext_vf8(dst, src);
2340       break;
2341     default:
2342       ShouldNotReachHere();
2343     }
2344   } else if (src_bt == T_SHORT) {
2345     if (dst_bt == T_INT) {
2346       vsext_vf2(dst, src);
2347     } else {
2348       vsext_vf4(dst, src);
2349     }
2350   } else if (src_bt == T_INT) {
2351     vsext_vf2(dst, src);
2352   }
2353 }
2354 
2355 // Vector narrow from src to dst with specified element sizes.
2356 // High part of dst vector will be filled with zero.
2357 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, int vector_length,
2358                                          VectorRegister src, BasicType src_bt) {
2359   assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
2360   assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2361   mv(t0, vector_length);
2362   if (src_bt == T_LONG) {
2363     // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
2364     // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
2365     // So we can currently only scale down by 1/2 the width at a time.
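    // e.g. a T_LONG -> T_BYTE narrow takes three passes: 64 -> 32 -> 16 -> 8 bits.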
2366     vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
2367     vncvt_x_x_w(dst, src);
2368     if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
2369       vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2370       vncvt_x_x_w(dst, dst);
2371       if (dst_bt == T_BYTE) {
2372         vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2373         vncvt_x_x_w(dst, dst);
2374       }
2375     }
2376   } else if (src_bt == T_INT) {
2377     // T_SHORT
2378     vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2379     vncvt_x_x_w(dst, src);
2380     if (dst_bt == T_BYTE) {
2381       vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2382       vncvt_x_x_w(dst, dst);
2383     }
2384   } else if (src_bt == T_SHORT) {
2385     vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2386     vncvt_x_x_w(dst, src);
2387   }
2388 }
2389 
2390 #define VFCVT_SAFE(VFLOATCVT)                                                      \
2391 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
2392   assert_different_registers(dst, src);                                            \
2393   vxor_vv(dst, dst, dst);                                                          \
2394   vmfeq_vv(v0, src, src);                                                          \
2395   VFLOATCVT(dst, src, Assembler::v0_t);                                            \
2396 }
2397 
2398 VFCVT_SAFE(vfcvt_rtz_x_f_v);
2399 
2400 #undef VFCVT_SAFE
2401 
2402 // Extract a scalar element from a vector at position 'idx'.
2403 // The input elements in src are expected to be of integral type.
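// vslidedown_vi encodes its index as a 5-bit unsigned immediate, so positions above 31
// are slid down via a scratch register and vslidedown_vx instead.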
2404 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
2405                                   int idx, VectorRegister tmp) {
2406   assert(is_integral_type(bt), "unsupported element type");
2407   assert(idx >= 0, "idx cannot be negative");
2408   // Only need the first element after vector slidedown
2409   vsetvli_helper(bt, 1);
2410   if (idx == 0) {
2411     vmv_x_s(dst, src);
2412   } else if (idx <= 31) {
2413     vslidedown_vi(tmp, src, idx);
2414     vmv_x_s(dst, tmp);
2415   } else {
2416     mv(t0, idx);
2417     vslidedown_vx(tmp, src, t0);
2418     vmv_x_s(dst, tmp);
2419   }
2420 }
2421 
2422 // Extract a scalar element from a vector at position 'idx'.
2423 // The input elements in src are expected to be of floating point type.
2424 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
2425                                      int idx, VectorRegister tmp) {
2426   assert(is_floating_point_type(bt), "unsupported element type");
2427   assert(idx >= 0, "idx cannot be negative");
2428   // Only need the first element after vector slidedown
2429   vsetvli_helper(bt, 1);
2430   if (idx == 0) {
2431     vfmv_f_s(dst, src);
2432   } else if (idx <= 31) {
2433     vslidedown_vi(tmp, src, idx);
2434     vfmv_f_s(dst, tmp);
2435   } else {
2436     mv(t0, idx);
2437     vslidedown_vx(tmp, src, t0);
2438     vfmv_f_s(dst, tmp);
2439   }
2440 }