1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/assembler.hpp" 28 #include "asm/assembler.inline.hpp" 29 #include "opto/c2_MacroAssembler.hpp" 30 #include "opto/compile.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 #include "utilities/globalDefinitions.hpp" 36 37 #ifdef PRODUCT 38 #define BLOCK_COMMENT(str) /* nothing */ 39 #define STOP(error) stop(error) 40 #else 41 #define BLOCK_COMMENT(str) block_comment(str) 42 #define STOP(error) block_comment(error); stop(error) 43 #endif 44 45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 46 47 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, 48 Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) { 49 // Use cr register to indicate the fast_lock result: zero for success; non-zero for failure. 50 Register flag = t1; 51 Register oop = objectReg; 52 Register box = boxReg; 53 Register disp_hdr = tmp1Reg; 54 Register tmp = tmp2Reg; 55 Label object_has_monitor; 56 // Finish fast lock successfully. MUST branch to with flag == 0 57 Label locked; 58 // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0 59 Label slow_path; 60 61 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 62 assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0); 63 64 mv(flag, 1); 65 66 // Load markWord from object into displaced_header. 67 ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 68 69 if (DiagnoseSyncOnValueBasedClasses != 0) { 70 load_klass(tmp, oop); 71 lbu(tmp, Address(tmp, Klass::misc_flags_offset())); 72 test_bit(tmp, tmp, exact_log2(KlassFlags::_misc_is_value_based_class)); 73 bnez(tmp, slow_path); 74 } 75 76 if (LockingMode == LM_MONITOR) { 77 j(slow_path); 78 } else { 79 assert(LockingMode == LM_LEGACY, "must be"); 80 81 // Check for existing monitor 82 test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value)); 83 bnez(tmp, object_has_monitor); 84 85 // Set tmp to be (markWord of object | UNLOCK_VALUE). 86 ori(tmp, disp_hdr, markWord::unlocked_value); 87 88 // Initialize the box. (Must happen before we update the object mark!) 
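    // The BasicLock box on the caller's stack keeps a copy of the object's
    // unlocked markWord (the "displaced header"); fast_unlock later CASes this
    // value back into the object's mark. The store below seeds the box before
    // the object's mark itself is updated.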
    sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64,
            Assembler::aq, Assembler::rl, /*result*/disp_hdr);
    beq(disp_hdr, tmp, locked);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, we found an unlocked object, have
    // now locked it, and have already branched to label locked above.
    // Otherwise we did not see an unlocked object, so try the fast recursive case.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    sub(disp_hdr, disp_hdr, sp);
    mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
    // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto label
    // locked, hence we can store 0 as the displaced header in the box, which indicates that it
    // is a recursive lock.
    andr(tmp/*==0?*/, disp_hdr, tmp);
    sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    beqz(tmp, locked);
    j(slow_path);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread id, a stack address for LM_LEGACY,
  // or the ANONYMOUS_OWNER constant for LM_LIGHTWEIGHT.
  //
  // Try to CAS m->owner from null to current thread id.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
  Register tid = disp_hdr;
  ld(tid, Address(xthread, JavaThread::lock_id_offset()));
  cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/tid, Assembler::int64,
          Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected)

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
  mv(tmp, (address)markWord::unused_mark().value());
  sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  beqz(tmp3Reg, locked); // CAS success means locking succeeded

  bne(tmp3Reg, tid, slow_path); // Check for recursive locking

  // Recursive lock case
  // Reload markWord from object into displaced_header.
  ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg);

  bind(locked);
  mv(flag, zr);
  inc_held_monitor_count();

#ifdef ASSERT
  // Check that locked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Lock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Lock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
                                    Register tmp1Reg, Register tmp2Reg) {
  // Use cr register to indicate the fast_unlock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register owner_addr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label object_has_monitor;
  // Finish fast unlock successfully. unlocked MUST branch to with flag == 0
  Label unlocked;
  // Finish fast unlock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);

  mv(flag, 1);

  if (LockingMode == LM_MONITOR) {
    j(slow_path);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
  }

  // Find the lock address and load the displaced header from the stack.
  ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  // If the displaced header is 0, we have a recursive unlock.
  beqz(disp_hdr, unlocked);

  // Handle existing monitor.
  ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  test_bit(t0, tmp, exact_log2(markWord::monitor_value));
  bnez(t0, object_has_monitor);

  // Check if it is still a light weight lock, this is true if we
  // see the stack address of the basicLock in the markWord of the
  // object.
  cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64,
          Assembler::relaxed, Assembler::rl, /*result*/tmp);
  beq(box, tmp, unlocked); // box == tmp if cas succeeds
  j(slow_path);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.

  // Recursive lock
  addi(disp_hdr, disp_hdr, -1);
  sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  j(unlocked);

  bind(notRecursive);
  // Compute owner address.
  la(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
  sd(zr, Address(owner_addr));
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty.
  ld(t0, Address(tmp, ObjectMonitor::EntryList_offset()));
  ld(tmp1Reg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(t0, t0, tmp1Reg);
  beqz(t0, unlocked); // If so we are done.

  // Check if there is a successor.
  ld(t0, Address(tmp, ObjectMonitor::succ_offset()));
  bnez(t0, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  sd(tmp, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));

  mv(flag, 1);
  j(slow_path);

  bind(unlocked);
  mv(flag, zr);
  dec_held_monitor_count();

#ifdef ASSERT
  // Check that unlocked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box,
                                              Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
  // Flag register, zero for success; non-zero for failure.
  Register flag = t1;

  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, tmp1, tmp2, tmp3, tmp4, flag, t0);

  mv(flag, 1);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == 0
  Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    sd(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp1, obj);
    lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
    test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
    bnez(tmp1, slow_path);
  }

  const Register tmp1_mark = tmp1;
  const Register tmp3_t = tmp3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
    Label push;

    const Register tmp2_top = tmp2;

    // Check if lock-stack is full.
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    mv(tmp3_t, (unsigned)LockStack::end_offset());
    bge(tmp2_top, tmp3_t, slow_path);

    // Check if recursive.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
    xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
    bne(tmp1_mark, tmp3_t, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    add(tmp3_t, xthread, tmp2_top);
    sd(obj, Address(tmp3_t));
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(locked);
  }

  { // Handle inflated monitor.
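    // With UseObjectMonitorTable the monitor is not tagged in the markWord, so
    // the code below first probes the per-thread OMCache (two unrolled probes,
    // then a loop that stops at the terminating null sentinel) and falls back
    // to the slow path on a cache miss; without the table the monitor pointer
    // is recovered directly from the mark loaded above.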
354 bind(inflated); 355 356 const Register tmp1_monitor = tmp1; 357 358 if (!UseObjectMonitorTable) { 359 assert(tmp1_monitor == tmp1_mark, "should be the same here"); 360 } else { 361 Label monitor_found; 362 363 // Load cache address 364 la(tmp3_t, Address(xthread, JavaThread::om_cache_oops_offset())); 365 366 const int num_unrolled = 2; 367 for (int i = 0; i < num_unrolled; i++) { 368 ld(tmp1, Address(tmp3_t)); 369 beq(obj, tmp1, monitor_found); 370 add(tmp3_t, tmp3_t, in_bytes(OMCache::oop_to_oop_difference())); 371 } 372 373 Label loop; 374 375 // Search for obj in cache. 376 bind(loop); 377 378 // Check for match. 379 ld(tmp1, Address(tmp3_t)); 380 beq(obj, tmp1, monitor_found); 381 382 // Search until null encountered, guaranteed _null_sentinel at end. 383 add(tmp3_t, tmp3_t, in_bytes(OMCache::oop_to_oop_difference())); 384 bnez(tmp1, loop); 385 // Cache Miss. Take the slowpath. 386 j(slow_path); 387 388 bind(monitor_found); 389 ld(tmp1_monitor, Address(tmp3_t, OMCache::oop_to_monitor_difference())); 390 } 391 392 const Register tmp2_owner_addr = tmp2; 393 const Register tmp3_owner = tmp3; 394 395 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 396 const Address owner_address(tmp1_monitor, ObjectMonitor::owner_offset() - monitor_tag); 397 const Address recursions_address(tmp1_monitor, ObjectMonitor::recursions_offset() - monitor_tag); 398 399 Label monitor_locked; 400 401 // Compute owner address. 402 la(tmp2_owner_addr, owner_address); 403 404 // CAS owner (null => current thread). 405 ld(tmp4, Address(xthread, JavaThread::lock_id_offset())); 406 cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ tmp4, Assembler::int64, 407 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner); 408 beqz(tmp3_owner, monitor_locked); 409 410 // Check if recursive. 411 bne(tmp3_owner, tmp4, slow_path); 412 413 // Recursive. 414 increment(recursions_address, 1, tmp2, tmp3); 415 416 bind(monitor_locked); 417 if (UseObjectMonitorTable) { 418 sd(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 419 } 420 } 421 422 bind(locked); 423 mv(flag, zr); 424 425 #ifdef ASSERT 426 // Check that locked label is reached with flag == 0. 427 Label flag_correct; 428 beqz(flag, flag_correct); 429 stop("Fast Lock Flag != 0"); 430 #endif 431 432 bind(slow_path); 433 #ifdef ASSERT 434 // Check that slow_path label is reached with flag != 0. 435 bnez(flag, flag_correct); 436 stop("Fast Lock Flag == 0"); 437 bind(flag_correct); 438 #endif 439 // C2 uses the value of flag (0 vs !0) to determine the continuation. 440 } 441 442 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, 443 Register tmp1, Register tmp2, Register tmp3) { 444 // Flag register, zero for success; non-zero for failure. 445 Register flag = t1; 446 447 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 448 assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0); 449 450 mv(flag, 1); 451 452 // Handle inflated monitor. 453 Label inflated, inflated_load_mark; 454 // Finish fast unlock successfully. unlocked MUST branch to with flag == 0 455 Label unlocked; 456 // Finish fast unlock unsuccessfully. MUST branch to with flag != 0 457 Label slow_path; 458 459 const Register tmp1_mark = tmp1; 460 const Register tmp2_top = tmp2; 461 const Register tmp3_t = tmp3; 462 463 { // Lightweight unlock 464 Label push_and_slow_path; 465 466 // Check if obj is top of lock-stack. 
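    // Roughly, in pseudo-C (a sketch of the next few instructions; field
    // names are approximate):
    //   top = thread->lock_stack_top - oopSize;
    //   if (*(oop*)((char*)thread + top) != obj) goto inflated_load_mark;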
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    subw(tmp2_top, tmp2_top, oopSize);
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t));
    // Top of lock stack was not obj. Must be monitor.
    bne(obj, tmp3_t, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
    DEBUG_ONLY(sd(zr, Address(tmp3_t));)
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, unlocked);

    // Not recursive.
    // Load Mark.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
    beq(tmp1_mark, tmp3_t, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
    DEBUG_ONLY(sd(obj, Address(tmp3_t));)
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(tmp2_top, tmp2_top, oopSize);
    mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
    blt(tmp2_top, tmp3_t, check_done);
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t));
    bne(obj, tmp3_t, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register tmp1_monitor = tmp1;

    if (!UseObjectMonitorTable) {
      assert(tmp1_monitor == tmp1_mark, "should be the same here");
      // Untag the monitor.
      add(tmp1_monitor, tmp1_mark, -(int)markWord::monitor_value);
    } else {
      ld(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // No valid pointer below alignof(ObjectMonitor*). Take the slow path.
      mv(tmp3_t, alignof(ObjectMonitor*));
      bltu(tmp1_monitor, tmp3_t, slow_path);
    }

    const Register tmp2_recursions = tmp2;
    Label not_recursive;

    // Check if recursive.
    ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
    beqz(tmp2_recursions, not_recursive);

    // Recursive unlock.
    addi(tmp2_recursions, tmp2_recursions, -1);
    sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
    j(unlocked);

    bind(not_recursive);

    const Register tmp2_owner_addr = tmp2;

    // Compute owner address.
    la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
    sd(zr, Address(tmp2_owner_addr));
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty.
    ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset()));
    ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset()));
    orr(t0, t0, tmp3_t);
    beqz(t0, unlocked); // If so we are done.

    // Check if there is a successor.
    ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::succ_offset()));
    bnez(tmp3_t, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try
    // to reacquire the lock in SharedRuntime::monitor_exit_helper().
    sd(tmp1_monitor, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));

    mv(flag, 1);
    j(slow_path);
  }

  bind(unlocked);
  mv(flag, zr);

#ifdef ASSERT
  // Check that unlocked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

// short string
// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
                                                  Register ch, Register result,
                                                  bool isL)
{
  Register ch1 = t0;
  Register index = t1;

  BLOCK_COMMENT("string_indexof_char_short {");

  Label LOOP, LOOP1, LOOP4, LOOP8;
  Label MATCH, MATCH1, MATCH2, MATCH3,
        MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;

  mv(result, -1);
  mv(index, zr);

  bind(LOOP);
  addi(t0, index, 8);
  ble(t0, cnt1, LOOP8);
  addi(t0, index, 4);
  ble(t0, cnt1, LOOP4);
  j(LOOP1);

  bind(LOOP8);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
  beq(ch, ch1, MATCH4);
  isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
  beq(ch, ch1, MATCH5);
  isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
  beq(ch, ch1, MATCH6);
  isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
  beq(ch, ch1, MATCH7);
  addi(index, index, 8);
  addi(str1, str1, isL ? 8 : 16);
  blt(index, cnt1, LOOP);
  j(NOMATCH);

  bind(LOOP4);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ?
lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4)); 667 beq(ch, ch1, MATCH2); 668 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6)); 669 beq(ch, ch1, MATCH3); 670 addi(index, index, 4); 671 addi(str1, str1, isL ? 4 : 8); 672 bge(index, cnt1, NOMATCH); 673 674 bind(LOOP1); 675 isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1)); 676 beq(ch, ch1, MATCH); 677 addi(index, index, 1); 678 addi(str1, str1, isL ? 1 : 2); 679 blt(index, cnt1, LOOP1); 680 j(NOMATCH); 681 682 bind(MATCH1); 683 addi(index, index, 1); 684 j(MATCH); 685 686 bind(MATCH2); 687 addi(index, index, 2); 688 j(MATCH); 689 690 bind(MATCH3); 691 addi(index, index, 3); 692 j(MATCH); 693 694 bind(MATCH4); 695 addi(index, index, 4); 696 j(MATCH); 697 698 bind(MATCH5); 699 addi(index, index, 5); 700 j(MATCH); 701 702 bind(MATCH6); 703 addi(index, index, 6); 704 j(MATCH); 705 706 bind(MATCH7); 707 addi(index, index, 7); 708 709 bind(MATCH); 710 mv(result, index); 711 bind(NOMATCH); 712 BLOCK_COMMENT("} string_indexof_char_short"); 713 } 714 715 // StringUTF16.indexOfChar 716 // StringLatin1.indexOfChar 717 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 718 Register ch, Register result, 719 Register tmp1, Register tmp2, 720 Register tmp3, Register tmp4, 721 bool isL) 722 { 723 Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG; 724 Register ch1 = t0; 725 Register orig_cnt = t1; 726 Register mask1 = tmp3; 727 Register mask2 = tmp2; 728 Register match_mask = tmp1; 729 Register trailing_char = tmp4; 730 Register unaligned_elems = tmp4; 731 732 BLOCK_COMMENT("string_indexof_char {"); 733 beqz(cnt1, NOMATCH); 734 735 addi(t0, cnt1, isL ? -32 : -16); 736 bgtz(t0, DO_LONG); 737 string_indexof_char_short(str1, cnt1, ch, result, isL); 738 j(DONE); 739 740 bind(DO_LONG); 741 mv(orig_cnt, cnt1); 742 if (AvoidUnalignedAccesses) { 743 Label ALIGNED; 744 andi(unaligned_elems, str1, 0x7); 745 beqz(unaligned_elems, ALIGNED); 746 sub(unaligned_elems, unaligned_elems, 8); 747 neg(unaligned_elems, unaligned_elems); 748 if (!isL) { 749 srli(unaligned_elems, unaligned_elems, 1); 750 } 751 // do unaligned part per element 752 string_indexof_char_short(str1, unaligned_elems, ch, result, isL); 753 bgez(result, DONE); 754 mv(orig_cnt, cnt1); 755 sub(cnt1, cnt1, unaligned_elems); 756 bind(ALIGNED); 757 } 758 759 // duplicate ch 760 if (isL) { 761 slli(ch1, ch, 8); 762 orr(ch, ch1, ch); 763 } 764 slli(ch1, ch, 16); 765 orr(ch, ch1, ch); 766 slli(ch1, ch, 32); 767 orr(ch, ch1, ch); 768 769 if (!isL) { 770 slli(cnt1, cnt1, 1); 771 } 772 773 uint64_t mask0101 = UCONST64(0x0101010101010101); 774 uint64_t mask0001 = UCONST64(0x0001000100010001); 775 mv(mask1, isL ? mask0101 : mask0001); 776 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 777 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 778 mv(mask2, isL ? 
     mask7f7f : mask7fff);

  bind(CH1_LOOP);
  ld(ch1, Address(str1));
  addi(str1, str1, 8);
  addi(cnt1, cnt1, -8);
  compute_match_mask(ch1, ch, match_mask, mask1, mask2);
  bnez(match_mask, HIT);
  bgtz(cnt1, CH1_LOOP);
  j(NOMATCH);

  bind(HIT);
  ctzc_bit(trailing_char, match_mask, isL, ch1, result);
  srli(trailing_char, trailing_char, 3);
  addi(cnt1, cnt1, 8);
  ble(cnt1, trailing_char, NOMATCH);
  // match case
  if (!isL) {
    srli(cnt1, cnt1, 1);
    srli(trailing_char, trailing_char, 1);
  }

  sub(result, orig_cnt, cnt1);
  add(result, result, trailing_char);
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof_char");
}

typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);

// Search for needle in haystack and return index or -1
// x10: result
// x11: haystack
// x12: haystack_len
// x13: needle
// x14: needle_len
void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
                                       Register haystack_len, Register needle_len,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       Register result, int ae)
{
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;

  Register ch1 = t0;
  Register ch2 = t1;
  Register nlen_tmp = tmp1; // needle len tmp
  Register hlen_tmp = tmp2; // haystack len tmp
  Register result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                    (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_indexof {");

  // Note, inline_string_indexOf() generates checks:
  // if (pattern.count > src.count) return -1;
  // if (pattern.count == 0) return 0;

  // We have two strings, a source string in haystack, haystack_len and a pattern string
  // in needle, needle_len. Find the first occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  // needle_len >= 8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
  sub(result_tmp, haystack_len, needle_len);
  // needle_len < 8, use linear scan
  sub(t0, needle_len, 8);
  bltz(t0, LINEARSEARCH);
  // needle_len >= 256, use linear scan
  sub(t0, needle_len, 256);
  bgez(t0, LINEARSTUB);
  // needle_len >= haystack_len/4, use linear scan
  srli(t0, haystack_len, 2);
  bge(needle_len, t0, LINEARSTUB);

  // Boyer-Moore-Horspool introduction:
  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules.
The 'Bad Character' rule 879 // and the 'Good Suffix' rule. 880 // 881 // These rules are essentially heuristics for how far we can shift the 882 // pattern along the search string. 883 // 884 // The implementation here uses the 'Bad Character' rule only because of the 885 // complexity of initialisation for the 'Good Suffix' rule. 886 // 887 // This is also known as the Boyer-Moore-Horspool algorithm: 888 // 889 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 890 // 891 // #define ASIZE 256 892 // 893 // int bm(unsigned char *pattern, int m, unsigned char *src, int n) { 894 // int i, j; 895 // unsigned c; 896 // unsigned char bc[ASIZE]; 897 // 898 // /* Preprocessing */ 899 // for (i = 0; i < ASIZE; ++i) 900 // bc[i] = m; 901 // for (i = 0; i < m - 1; ) { 902 // c = pattern[i]; 903 // ++i; 904 // // c < 256 for Latin1 string, so, no need for branch 905 // #ifdef PATTERN_STRING_IS_LATIN1 906 // bc[c] = m - i; 907 // #else 908 // if (c < ASIZE) bc[c] = m - i; 909 // #endif 910 // } 911 // 912 // /* Searching */ 913 // j = 0; 914 // while (j <= n - m) { 915 // c = src[i+j]; 916 // if (pattern[m-1] == c) 917 // int k; 918 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k); 919 // if (k < 0) return j; 920 // // c < 256 for Latin1 string, so, no need for branch 921 // #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1 922 // // LL case: (c< 256) always true. Remove branch 923 // j += bc[pattern[j+m-1]]; 924 // #endif 925 // #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF 926 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 927 // if (c < ASIZE) 928 // j += bc[pattern[j+m-1]]; 929 // else 930 // j += 1 931 // #endif 932 // #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1 933 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 934 // if (c < ASIZE) 935 // j += bc[pattern[j+m-1]]; 936 // else 937 // j += m 938 // #endif 939 // } 940 // return -1; 941 // } 942 943 // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result 944 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 945 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 946 947 Register haystack_end = haystack_len; 948 Register skipch = tmp2; 949 950 // pattern length is >=8, so, we can read at least 1 register for cases when 951 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 952 // UL case. We'll re-read last character in inner pre-loop code to have 953 // single outer pre-loop load 954 const int firstStep = isLL ? 
7 : 3; 955 956 const int ASIZE = 256; 957 const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd) 958 959 sub(sp, sp, ASIZE); 960 961 // init BC offset table with default value: needle_len 962 slli(t0, needle_len, 8); 963 orr(t0, t0, needle_len); // [63...16][needle_len][needle_len] 964 slli(tmp1, t0, 16); 965 orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len] 966 slli(tmp1, t0, 32); 967 orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len] 968 969 mv(ch1, sp); // ch1 is t0 970 mv(tmp6, ASIZE / STORE_BYTES); // loop iterations 971 972 bind(BM_INIT_LOOP); 973 // for (i = 0; i < ASIZE; ++i) 974 // bc[i] = m; 975 for (int i = 0; i < 4; i++) { 976 sd(tmp5, Address(ch1, i * wordSize)); 977 } 978 add(ch1, ch1, 32); 979 sub(tmp6, tmp6, 4); 980 bgtz(tmp6, BM_INIT_LOOP); 981 982 sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern 983 Register orig_haystack = tmp5; 984 mv(orig_haystack, haystack); 985 // result_tmp = tmp4 986 shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift); 987 sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1 988 mv(tmp3, needle); 989 990 // for (i = 0; i < m - 1; ) { 991 // c = pattern[i]; 992 // ++i; 993 // // c < 256 for Latin1 string, so, no need for branch 994 // #ifdef PATTERN_STRING_IS_LATIN1 995 // bc[c] = m - i; 996 // #else 997 // if (c < ASIZE) bc[c] = m - i; 998 // #endif 999 // } 1000 bind(BCLOOP); 1001 (this->*needle_load_1chr)(ch1, Address(tmp3), noreg); 1002 add(tmp3, tmp3, needle_chr_size); 1003 if (!needle_isL) { 1004 // ae == StrIntrinsicNode::UU 1005 mv(tmp6, ASIZE); 1006 bgeu(ch1, tmp6, BCSKIP); 1007 } 1008 add(tmp4, sp, ch1); 1009 sb(ch2, Address(tmp4)); // store skip offset to BC offset table 1010 1011 bind(BCSKIP); 1012 sub(ch2, ch2, 1); // for next pattern element, skip distance -1 1013 bgtz(ch2, BCLOOP); 1014 1015 // tmp6: pattern end, address after needle 1016 shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift); 1017 if (needle_isL == haystack_isL) { 1018 // load last 8 bytes (8LL/4UU symbols) 1019 ld(tmp6, Address(tmp6, -wordSize)); 1020 } else { 1021 // UL: from UTF-16(source) search Latin1(pattern) 1022 lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols) 1023 // convert Latin1 to UTF. 
eg: 0x0000abcd -> 0x0a0b0c0d 1024 // We'll have to wait until load completed, but it's still faster than per-character loads+checks 1025 srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a 1026 slli(ch2, tmp6, XLEN - 24); 1027 srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b 1028 slli(ch1, tmp6, XLEN - 16); 1029 srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c 1030 andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d 1031 slli(ch2, ch2, 16); 1032 orr(ch2, ch2, ch1); // 0x00000b0c 1033 slli(result, tmp3, 48); // use result as temp register 1034 orr(tmp6, tmp6, result); // 0x0a00000d 1035 slli(result, ch2, 16); 1036 orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d 1037 } 1038 1039 // i = m - 1; 1040 // skipch = j + i; 1041 // if (skipch == pattern[m - 1] 1042 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k); 1043 // else 1044 // move j with bad char offset table 1045 bind(BMLOOPSTR2); 1046 // compare pattern to source string backward 1047 shadd(result, nlen_tmp, haystack, result, haystack_chr_shift); 1048 (this->*haystack_load_1chr)(skipch, Address(result), noreg); 1049 sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8 1050 if (needle_isL == haystack_isL) { 1051 // re-init tmp3. It's for free because it's executed in parallel with 1052 // load above. Alternative is to initialize it before loop, but it'll 1053 // affect performance on in-order systems with 2 or more ld/st pipelines 1054 srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1] 1055 } 1056 if (!isLL) { // UU/UL case 1057 slli(ch2, nlen_tmp, 1); // offsets in bytes 1058 } 1059 bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char 1060 add(result, haystack, isLL ? nlen_tmp : ch2); 1061 // load 8 bytes from source string 1062 // if isLL is false then read granularity can be 2 1063 load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway 1064 mv(ch1, tmp6); 1065 if (isLL) { 1066 j(BMLOOPSTR1_AFTER_LOAD); 1067 } else { 1068 sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 1069 j(BMLOOPSTR1_CMP); 1070 } 1071 1072 bind(BMLOOPSTR1); 1073 shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift); 1074 (this->*needle_load_1chr)(ch1, Address(ch1), noreg); 1075 shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift); 1076 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 1077 1078 bind(BMLOOPSTR1_AFTER_LOAD); 1079 sub(nlen_tmp, nlen_tmp, 1); 1080 bltz(nlen_tmp, BMLOOPSTR1_LASTCMP); 1081 1082 bind(BMLOOPSTR1_CMP); 1083 beq(ch1, ch2, BMLOOPSTR1); 1084 1085 bind(BMSKIP); 1086 if (!isLL) { 1087 // if we've met UTF symbol while searching Latin1 pattern, then we can 1088 // skip needle_len symbols 1089 if (needle_isL != haystack_isL) { 1090 mv(result_tmp, needle_len); 1091 } else { 1092 mv(result_tmp, 1); 1093 } 1094 mv(t0, ASIZE); 1095 bgeu(skipch, t0, BMADV); 1096 } 1097 add(result_tmp, sp, skipch); 1098 lbu(result_tmp, Address(result_tmp)); // load skip offset 1099 1100 bind(BMADV); 1101 sub(nlen_tmp, needle_len, 1); 1102 // move haystack after bad char skip offset 1103 shadd(haystack, result_tmp, haystack, result, haystack_chr_shift); 1104 ble(haystack, haystack_end, BMLOOPSTR2); 1105 add(sp, sp, ASIZE); 1106 j(NOMATCH); 1107 1108 bind(BMLOOPSTR1_LASTCMP); 1109 bne(ch1, ch2, BMSKIP); 1110 1111 bind(BMMATCH); 1112 sub(result, haystack, orig_haystack); 1113 if (!haystack_isL) { 1114 srli(result, result, 1); 1115 } 1116 add(sp, sp, ASIZE); 1117 j(DONE); 1118 1119 bind(LINEARSTUB); 1120 sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm 1121 bltz(t0, LINEARSEARCH); 1122 mv(result, zr); 1123 RuntimeAddress stub = nullptr; 1124 if (isLL) { 1125 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll()); 1126 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 1127 } else if (needle_isL) { 1128 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul()); 1129 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 1130 } else { 1131 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu()); 1132 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 1133 } 1134 address call = reloc_call(stub); 1135 if (call == nullptr) { 1136 DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH)); 1137 ciEnv::current()->record_failure("CodeCache is full"); 1138 return; 1139 } 1140 j(DONE); 1141 1142 bind(NOMATCH); 1143 mv(result, -1); 1144 j(DONE); 1145 1146 bind(LINEARSEARCH); 1147 string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae); 1148 1149 bind(DONE); 1150 BLOCK_COMMENT("} string_indexof"); 1151 } 1152 1153 // string_indexof 1154 // result: x10 1155 // src: x11 1156 // src_count: x12 1157 // pattern: x13 1158 // pattern_count: x14 or 1/2/3/4 1159 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle, 1160 Register haystack_len, Register needle_len, 1161 Register tmp1, Register tmp2, 1162 Register tmp3, Register tmp4, 1163 int needle_con_cnt, Register result, int ae) 1164 { 1165 // Note: 1166 // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant 1167 // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1 1168 assert(needle_con_cnt <= 4, "Invalid needle constant count"); 1169 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 1170 1171 Register ch1 = t0; 1172 Register ch2 = t1; 1173 Register hlen_neg = haystack_len, nlen_neg = needle_len; 1174 Register nlen_tmp = tmp1, 
hlen_tmp = tmp2, result_tmp = tmp4; 1175 1176 bool isLL = ae == StrIntrinsicNode::LL; 1177 1178 bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 1179 bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 1180 int needle_chr_shift = needle_isL ? 0 : 1; 1181 int haystack_chr_shift = haystack_isL ? 0 : 1; 1182 int needle_chr_size = needle_isL ? 1 : 2; 1183 int haystack_chr_size = haystack_isL ? 1 : 2; 1184 1185 load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu : 1186 (load_chr_insn)&MacroAssembler::lhu; 1187 load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu : 1188 (load_chr_insn)&MacroAssembler::lhu; 1189 load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu; 1190 load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld; 1191 1192 Label DO1, DO2, DO3, MATCH, NOMATCH, DONE; 1193 1194 Register first = tmp3; 1195 1196 if (needle_con_cnt == -1) { 1197 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 1198 1199 sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2); 1200 bltz(t0, DOSHORT); 1201 1202 (this->*needle_load_1chr)(first, Address(needle), noreg); 1203 slli(t0, needle_len, needle_chr_shift); 1204 add(needle, needle, t0); 1205 neg(nlen_neg, t0); 1206 slli(t0, result_tmp, haystack_chr_shift); 1207 add(haystack, haystack, t0); 1208 neg(hlen_neg, t0); 1209 1210 bind(FIRST_LOOP); 1211 add(t0, haystack, hlen_neg); 1212 (this->*haystack_load_1chr)(ch2, Address(t0), noreg); 1213 beq(first, ch2, STR1_LOOP); 1214 1215 bind(STR2_NEXT); 1216 add(hlen_neg, hlen_neg, haystack_chr_size); 1217 blez(hlen_neg, FIRST_LOOP); 1218 j(NOMATCH); 1219 1220 bind(STR1_LOOP); 1221 add(nlen_tmp, nlen_neg, needle_chr_size); 1222 add(hlen_tmp, hlen_neg, haystack_chr_size); 1223 bgez(nlen_tmp, MATCH); 1224 1225 bind(STR1_NEXT); 1226 add(ch1, needle, nlen_tmp); 1227 (this->*needle_load_1chr)(ch1, Address(ch1), noreg); 1228 add(ch2, haystack, hlen_tmp); 1229 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 1230 bne(ch1, ch2, STR2_NEXT); 1231 add(nlen_tmp, nlen_tmp, needle_chr_size); 1232 add(hlen_tmp, hlen_tmp, haystack_chr_size); 1233 bltz(nlen_tmp, STR1_NEXT); 1234 j(MATCH); 1235 1236 bind(DOSHORT); 1237 if (needle_isL == haystack_isL) { 1238 sub(t0, needle_len, 2); 1239 bltz(t0, DO1); 1240 bgtz(t0, DO3); 1241 } 1242 } 1243 1244 if (needle_con_cnt == 4) { 1245 Label CH1_LOOP; 1246 (this->*load_4chr)(ch1, Address(needle), noreg); 1247 sub(result_tmp, haystack_len, 4); 1248 slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp 1249 add(haystack, haystack, tmp3); 1250 neg(hlen_neg, tmp3); 1251 if (AvoidUnalignedAccesses) { 1252 // preload first value, then we will read by 1 character per loop, instead of four 1253 // just shifting previous ch2 right by size of character in bits 1254 add(tmp3, haystack, hlen_neg); 1255 (this->*load_4chr)(ch2, Address(tmp3), noreg); 1256 if (isLL) { 1257 // need to erase 1 most significant byte in 32-bit value of ch2 1258 slli(ch2, ch2, 40); 1259 srli(ch2, ch2, 32); 1260 } else { 1261 slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation 1262 } 1263 } 1264 1265 bind(CH1_LOOP); 1266 add(tmp3, haystack, hlen_neg); 1267 if (AvoidUnalignedAccesses) { 1268 srli(ch2, ch2, isLL ? 8 : 16); 1269 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg); 1270 slli(tmp3, tmp3, isLL ? 
24 : 48); 1271 add(ch2, ch2, tmp3); 1272 } else { 1273 (this->*load_4chr)(ch2, Address(tmp3), noreg); 1274 } 1275 beq(ch1, ch2, MATCH); 1276 add(hlen_neg, hlen_neg, haystack_chr_size); 1277 blez(hlen_neg, CH1_LOOP); 1278 j(NOMATCH); 1279 } 1280 1281 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) { 1282 Label CH1_LOOP; 1283 BLOCK_COMMENT("string_indexof DO2 {"); 1284 bind(DO2); 1285 (this->*load_2chr)(ch1, Address(needle), noreg); 1286 if (needle_con_cnt == 2) { 1287 sub(result_tmp, haystack_len, 2); 1288 } 1289 slli(tmp3, result_tmp, haystack_chr_shift); 1290 add(haystack, haystack, tmp3); 1291 neg(hlen_neg, tmp3); 1292 if (AvoidUnalignedAccesses) { 1293 // preload first value, then we will read by 1 character per loop, instead of two 1294 // just shifting previous ch2 right by size of character in bits 1295 add(tmp3, haystack, hlen_neg); 1296 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg); 1297 slli(ch2, ch2, isLL ? 8 : 16); 1298 } 1299 bind(CH1_LOOP); 1300 add(tmp3, haystack, hlen_neg); 1301 if (AvoidUnalignedAccesses) { 1302 srli(ch2, ch2, isLL ? 8 : 16); 1303 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg); 1304 slli(tmp3, tmp3, isLL ? 8 : 16); 1305 add(ch2, ch2, tmp3); 1306 } else { 1307 (this->*load_2chr)(ch2, Address(tmp3), noreg); 1308 } 1309 beq(ch1, ch2, MATCH); 1310 add(hlen_neg, hlen_neg, haystack_chr_size); 1311 blez(hlen_neg, CH1_LOOP); 1312 j(NOMATCH); 1313 BLOCK_COMMENT("} string_indexof DO2"); 1314 } 1315 1316 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) { 1317 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 1318 BLOCK_COMMENT("string_indexof DO3 {"); 1319 1320 bind(DO3); 1321 (this->*load_2chr)(first, Address(needle), noreg); 1322 (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg); 1323 if (needle_con_cnt == 3) { 1324 sub(result_tmp, haystack_len, 3); 1325 } 1326 slli(hlen_tmp, result_tmp, haystack_chr_shift); 1327 add(haystack, haystack, hlen_tmp); 1328 neg(hlen_neg, hlen_tmp); 1329 1330 bind(FIRST_LOOP); 1331 add(ch2, haystack, hlen_neg); 1332 if (AvoidUnalignedAccesses) { 1333 (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2 1334 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 1335 slli(tmp2, tmp2, isLL ? 
8 : 16); 1336 add(ch2, ch2, tmp2); 1337 } else { 1338 (this->*load_2chr)(ch2, Address(ch2), noreg); 1339 } 1340 beq(first, ch2, STR1_LOOP); 1341 1342 bind(STR2_NEXT); 1343 add(hlen_neg, hlen_neg, haystack_chr_size); 1344 blez(hlen_neg, FIRST_LOOP); 1345 j(NOMATCH); 1346 1347 bind(STR1_LOOP); 1348 add(hlen_tmp, hlen_neg, 2 * haystack_chr_size); 1349 add(ch2, haystack, hlen_tmp); 1350 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 1351 bne(ch1, ch2, STR2_NEXT); 1352 j(MATCH); 1353 BLOCK_COMMENT("} string_indexof DO3"); 1354 } 1355 1356 if (needle_con_cnt == -1 || needle_con_cnt == 1) { 1357 Label DO1_LOOP; 1358 1359 BLOCK_COMMENT("string_indexof DO1 {"); 1360 bind(DO1); 1361 (this->*needle_load_1chr)(ch1, Address(needle), noreg); 1362 sub(result_tmp, haystack_len, 1); 1363 slli(tmp3, result_tmp, haystack_chr_shift); 1364 add(haystack, haystack, tmp3); 1365 neg(hlen_neg, tmp3); 1366 1367 bind(DO1_LOOP); 1368 add(tmp3, haystack, hlen_neg); 1369 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg); 1370 beq(ch1, ch2, MATCH); 1371 add(hlen_neg, hlen_neg, haystack_chr_size); 1372 blez(hlen_neg, DO1_LOOP); 1373 BLOCK_COMMENT("} string_indexof DO1"); 1374 } 1375 1376 bind(NOMATCH); 1377 mv(result, -1); 1378 j(DONE); 1379 1380 bind(MATCH); 1381 srai(t0, hlen_neg, haystack_chr_shift); 1382 add(result, result_tmp, t0); 1383 1384 bind(DONE); 1385 } 1386 1387 // Compare strings. 1388 void C2_MacroAssembler::string_compare(Register str1, Register str2, 1389 Register cnt1, Register cnt2, Register result, 1390 Register tmp1, Register tmp2, Register tmp3, 1391 int ae) 1392 { 1393 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 1394 DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 1395 SHORT_LOOP_START, TAIL_CHECK, L; 1396 1397 const int STUB_THRESHOLD = 64 + 8; 1398 bool isLL = ae == StrIntrinsicNode::LL; 1399 bool isLU = ae == StrIntrinsicNode::LU; 1400 bool isUL = ae == StrIntrinsicNode::UL; 1401 1402 bool str1_isL = isLL || isLU; 1403 bool str2_isL = isLL || isUL; 1404 1405 // for L strings, 1 byte for 1 character 1406 // for U strings, 2 bytes for 1 character 1407 int str1_chr_size = str1_isL ? 1 : 2; 1408 int str2_chr_size = str2_isL ? 1 : 2; 1409 int minCharsInWord = isLL ? wordSize : wordSize / 2; 1410 1411 load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu; 1412 load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu; 1413 1414 BLOCK_COMMENT("string_compare {"); 1415 1416 // Bizarrely, the counts are passed in bytes, regardless of whether they 1417 // are L or U strings, however the result is always in characters. 1418 if (!str1_isL) { 1419 sraiw(cnt1, cnt1, 1); 1420 } 1421 if (!str2_isL) { 1422 sraiw(cnt2, cnt2, 1); 1423 } 1424 1425 // Compute the minimum of the string lengths and save the difference in result. 
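  // In effect (sketch of the next four instructions):
  //   result = cnt1 - cnt2;        // length difference, the answer when one
  //                                // string is a prefix of the other
  //   cnt2   = MIN2(cnt1, cnt2);   // only the common prefix is compared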
1426 sub(result, cnt1, cnt2); 1427 bgt(cnt1, cnt2, L); 1428 mv(cnt2, cnt1); 1429 bind(L); 1430 1431 // A very short string 1432 mv(t0, minCharsInWord); 1433 ble(cnt2, t0, SHORT_STRING); 1434 1435 // Compare longwords 1436 // load first parts of strings and finish initialization while loading 1437 { 1438 if (str1_isL == str2_isL) { // LL or UU 1439 // check if str1 and str2 is same pointer 1440 beq(str1, str2, DONE); 1441 // load 8 bytes once to compare 1442 ld(tmp1, Address(str1)); 1443 ld(tmp2, Address(str2)); 1444 mv(t0, STUB_THRESHOLD); 1445 bge(cnt2, t0, STUB); 1446 sub(cnt2, cnt2, minCharsInWord); 1447 beqz(cnt2, TAIL_CHECK); 1448 // convert cnt2 from characters to bytes 1449 if (!str1_isL) { 1450 slli(cnt2, cnt2, 1); 1451 } 1452 add(str2, str2, cnt2); 1453 add(str1, str1, cnt2); 1454 sub(cnt2, zr, cnt2); 1455 } else if (isLU) { // LU case 1456 lwu(tmp1, Address(str1)); 1457 ld(tmp2, Address(str2)); 1458 mv(t0, STUB_THRESHOLD); 1459 bge(cnt2, t0, STUB); 1460 addi(cnt2, cnt2, -4); 1461 add(str1, str1, cnt2); 1462 sub(cnt1, zr, cnt2); 1463 slli(cnt2, cnt2, 1); 1464 add(str2, str2, cnt2); 1465 inflate_lo32(tmp3, tmp1); 1466 mv(tmp1, tmp3); 1467 sub(cnt2, zr, cnt2); 1468 addi(cnt1, cnt1, 4); 1469 } else { // UL case 1470 ld(tmp1, Address(str1)); 1471 lwu(tmp2, Address(str2)); 1472 mv(t0, STUB_THRESHOLD); 1473 bge(cnt2, t0, STUB); 1474 addi(cnt2, cnt2, -4); 1475 slli(t0, cnt2, 1); 1476 sub(cnt1, zr, t0); 1477 add(str1, str1, t0); 1478 add(str2, str2, cnt2); 1479 inflate_lo32(tmp3, tmp2); 1480 mv(tmp2, tmp3); 1481 sub(cnt2, zr, cnt2); 1482 addi(cnt1, cnt1, 8); 1483 } 1484 addi(cnt2, cnt2, isUL ? 4 : 8); 1485 bne(tmp1, tmp2, DIFFERENCE); 1486 bgez(cnt2, TAIL); 1487 1488 // main loop 1489 bind(NEXT_WORD); 1490 if (str1_isL == str2_isL) { // LL or UU 1491 add(t0, str1, cnt2); 1492 ld(tmp1, Address(t0)); 1493 add(t0, str2, cnt2); 1494 ld(tmp2, Address(t0)); 1495 addi(cnt2, cnt2, 8); 1496 } else if (isLU) { // LU case 1497 add(t0, str1, cnt1); 1498 lwu(tmp1, Address(t0)); 1499 add(t0, str2, cnt2); 1500 ld(tmp2, Address(t0)); 1501 addi(cnt1, cnt1, 4); 1502 inflate_lo32(tmp3, tmp1); 1503 mv(tmp1, tmp3); 1504 addi(cnt2, cnt2, 8); 1505 } else { // UL case 1506 add(t0, str2, cnt2); 1507 lwu(tmp2, Address(t0)); 1508 add(t0, str1, cnt1); 1509 ld(tmp1, Address(t0)); 1510 inflate_lo32(tmp3, tmp2); 1511 mv(tmp2, tmp3); 1512 addi(cnt1, cnt1, 8); 1513 addi(cnt2, cnt2, 4); 1514 } 1515 bne(tmp1, tmp2, DIFFERENCE); 1516 bltz(cnt2, NEXT_WORD); 1517 bind(TAIL); 1518 if (str1_isL == str2_isL) { // LL or UU 1519 load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2); 1520 load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2); 1521 } else if (isLU) { // LU case 1522 load_int_misaligned(tmp1, Address(str1), tmp3, false); 1523 load_long_misaligned(tmp2, Address(str2), tmp3, 2); 1524 inflate_lo32(tmp3, tmp1); 1525 mv(tmp1, tmp3); 1526 } else { // UL case 1527 load_int_misaligned(tmp2, Address(str2), tmp3, false); 1528 load_long_misaligned(tmp1, Address(str1), tmp3, 2); 1529 inflate_lo32(tmp3, tmp2); 1530 mv(tmp2, tmp3); 1531 } 1532 bind(TAIL_CHECK); 1533 beq(tmp1, tmp2, DONE); 1534 1535 // Find the first different characters in the longwords and 1536 // compute their difference. 
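    // Sketch of the DIFFERENCE sequence below for the LL case (the U cases use
    // 16-bit characters; ctzc_bit is taken here to round the trailing-zero
    // count of tmp1 ^ tmp2 down to a character boundary):
    //   sh     = ctz(tmp1 ^ tmp2) & ~(bits_per_char - 1);
    //   result = ((tmp1 >> sh) & char_mask) - ((tmp2 >> sh) & char_mask);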
1537 bind(DIFFERENCE); 1538 xorr(tmp3, tmp1, tmp2); 1539 ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb 1540 srl(tmp1, tmp1, result); 1541 srl(tmp2, tmp2, result); 1542 if (isLL) { 1543 andi(tmp1, tmp1, 0xFF); 1544 andi(tmp2, tmp2, 0xFF); 1545 } else { 1546 andi(tmp1, tmp1, 0xFFFF); 1547 andi(tmp2, tmp2, 0xFFFF); 1548 } 1549 sub(result, tmp1, tmp2); 1550 j(DONE); 1551 } 1552 1553 bind(STUB); 1554 RuntimeAddress stub = nullptr; 1555 switch (ae) { 1556 case StrIntrinsicNode::LL: 1557 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL()); 1558 break; 1559 case StrIntrinsicNode::UU: 1560 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU()); 1561 break; 1562 case StrIntrinsicNode::LU: 1563 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU()); 1564 break; 1565 case StrIntrinsicNode::UL: 1566 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL()); 1567 break; 1568 default: 1569 ShouldNotReachHere(); 1570 } 1571 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1572 address call = reloc_call(stub); 1573 if (call == nullptr) { 1574 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1575 ciEnv::current()->record_failure("CodeCache is full"); 1576 return; 1577 } 1578 j(DONE); 1579 1580 bind(SHORT_STRING); 1581 // Is the minimum length zero? 1582 beqz(cnt2, DONE); 1583 // arrange code to do most branches while loading and loading next characters 1584 // while comparing previous 1585 (this->*str1_load_chr)(tmp1, Address(str1), t0); 1586 addi(str1, str1, str1_chr_size); 1587 addi(cnt2, cnt2, -1); 1588 beqz(cnt2, SHORT_LAST_INIT); 1589 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1590 addi(str2, str2, str2_chr_size); 1591 j(SHORT_LOOP_START); 1592 bind(SHORT_LOOP); 1593 addi(cnt2, cnt2, -1); 1594 beqz(cnt2, SHORT_LAST); 1595 bind(SHORT_LOOP_START); 1596 (this->*str1_load_chr)(tmp2, Address(str1), t0); 1597 addi(str1, str1, str1_chr_size); 1598 (this->*str2_load_chr)(t0, Address(str2), t0); 1599 addi(str2, str2, str2_chr_size); 1600 bne(tmp1, cnt1, SHORT_LOOP_TAIL); 1601 addi(cnt2, cnt2, -1); 1602 beqz(cnt2, SHORT_LAST2); 1603 (this->*str1_load_chr)(tmp1, Address(str1), t0); 1604 addi(str1, str1, str1_chr_size); 1605 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1606 addi(str2, str2, str2_chr_size); 1607 beq(tmp2, t0, SHORT_LOOP); 1608 sub(result, tmp2, t0); 1609 j(DONE); 1610 bind(SHORT_LOOP_TAIL); 1611 sub(result, tmp1, cnt1); 1612 j(DONE); 1613 bind(SHORT_LAST2); 1614 beq(tmp2, t0, DONE); 1615 sub(result, tmp2, t0); 1616 1617 j(DONE); 1618 bind(SHORT_LAST_INIT); 1619 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1620 addi(str2, str2, str2_chr_size); 1621 bind(SHORT_LAST); 1622 beq(tmp1, cnt1, DONE); 1623 sub(result, tmp1, cnt1); 1624 1625 bind(DONE); 1626 1627 BLOCK_COMMENT("} string_compare"); 1628 } 1629 1630 void C2_MacroAssembler::arrays_equals(Register a1, Register a2, 1631 Register tmp1, Register tmp2, Register tmp3, 1632 Register result, int elem_size) { 1633 assert(elem_size == 1 || elem_size == 2, "must be char or byte"); 1634 assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0); 1635 1636 int elem_per_word = wordSize/elem_size; 1637 int log_elem_size = exact_log2(elem_size); 1638 int length_offset = arrayOopDesc::length_offset_in_bytes(); 1639 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? 
T_CHAR : T_BYTE); 1640 1641 Register cnt1 = tmp3; 1642 Register cnt2 = tmp1; // cnt2 only used in array length compare 1643 Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01; 1644 1645 BLOCK_COMMENT("arrays_equals {"); 1646 1647 // if (a1 == a2), return true 1648 beq(a1, a2, SAME); 1649 1650 mv(result, false); 1651 // if (a1 == nullptr || a2 == nullptr) 1652 // return false; 1653 beqz(a1, DONE); 1654 beqz(a2, DONE); 1655 1656 // if (a1.length != a2.length) 1657 // return false; 1658 lwu(cnt1, Address(a1, length_offset)); 1659 lwu(cnt2, Address(a2, length_offset)); 1660 bne(cnt1, cnt2, DONE); 1661 1662 la(a1, Address(a1, base_offset)); 1663 la(a2, Address(a2, base_offset)); 1664 // Check for short strings, i.e. smaller than wordSize. 1665 addi(cnt1, cnt1, -elem_per_word); 1666 bltz(cnt1, SHORT); 1667 1668 // Main 8 byte comparison loop. 1669 bind(NEXT_WORD); { 1670 ld(tmp1, Address(a1)); 1671 ld(tmp2, Address(a2)); 1672 addi(cnt1, cnt1, -elem_per_word); 1673 addi(a1, a1, wordSize); 1674 addi(a2, a2, wordSize); 1675 bne(tmp1, tmp2, DONE); 1676 } bgez(cnt1, NEXT_WORD); 1677 1678 addi(tmp1, cnt1, elem_per_word); 1679 beqz(tmp1, SAME); 1680 1681 bind(SHORT); 1682 test_bit(tmp1, cnt1, 2 - log_elem_size); 1683 beqz(tmp1, TAIL03); // 0-7 bytes left. 1684 { 1685 lwu(tmp1, Address(a1)); 1686 lwu(tmp2, Address(a2)); 1687 addi(a1, a1, 4); 1688 addi(a2, a2, 4); 1689 bne(tmp1, tmp2, DONE); 1690 } 1691 1692 bind(TAIL03); 1693 test_bit(tmp1, cnt1, 1 - log_elem_size); 1694 beqz(tmp1, TAIL01); // 0-3 bytes left. 1695 { 1696 lhu(tmp1, Address(a1)); 1697 lhu(tmp2, Address(a2)); 1698 addi(a1, a1, 2); 1699 addi(a2, a2, 2); 1700 bne(tmp1, tmp2, DONE); 1701 } 1702 1703 bind(TAIL01); 1704 if (elem_size == 1) { // Only needed when comparing byte arrays. 1705 test_bit(tmp1, cnt1, 0); 1706 beqz(tmp1, SAME); // 0-1 bytes left. 1707 { 1708 lbu(tmp1, Address(a1)); 1709 lbu(tmp2, Address(a2)); 1710 bne(tmp1, tmp2, DONE); 1711 } 1712 } 1713 1714 bind(SAME); 1715 mv(result, true); 1716 // That's it. 1717 bind(DONE); 1718 1719 BLOCK_COMMENT("} arrays_equals"); 1720 } 1721 1722 // Compare Strings 1723 1724 // For Strings we're passed the address of the first characters in a1 and a2 1725 // and the length in cnt1. There are two implementations. 1726 // For arrays >= 8 bytes, all comparisons (except for the tail) are performed 1727 // 8 bytes at a time. For the tail, we compare a halfword, then a short, and then a byte. 1728 // For strings < 8 bytes, we compare a halfword, then a short, and then a byte. 1729 1730 void C2_MacroAssembler::string_equals(Register a1, Register a2, 1731 Register result, Register cnt1) 1732 { 1733 Label SAME, DONE, SHORT, NEXT_WORD; 1734 Register tmp1 = t0; 1735 Register tmp2 = t1; 1736 1737 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2); 1738 1739 BLOCK_COMMENT("string_equals {"); 1740 1741 mv(result, false); 1742 1743 // Check for short strings, i.e. smaller than wordSize. 1744 addi(cnt1, cnt1, -wordSize); 1745 bltz(cnt1, SHORT); 1746 1747 // Main 8 byte comparison loop. 1748 bind(NEXT_WORD); { 1749 ld(tmp1, Address(a1)); 1750 ld(tmp2, Address(a2)); 1751 addi(cnt1, cnt1, -wordSize); 1752 addi(a1, a1, wordSize); 1753 addi(a2, a2, wordSize); 1754 bne(tmp1, tmp2, DONE); 1755 } bgez(cnt1, NEXT_WORD); 1756 1757 addi(tmp1, cnt1, wordSize); 1758 beqz(tmp1, SAME); 1759 1760 bind(SHORT); 1761 Label TAIL03, TAIL01; 1762 1763 // 0-7 bytes left. 
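  // At this point cnt1 equals remaining_bytes - wordSize, and subtracting
  // wordSize leaves bits 0..2 unchanged, so the low bits of cnt1 still select
  // the tail sizes: bit 2 -> 4-byte compare, bit 1 -> 2-byte, bit 0 -> 1-byte.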
1764 test_bit(tmp1, cnt1, 2); 1765 beqz(tmp1, TAIL03); 1766 { 1767 lwu(tmp1, Address(a1)); 1768 lwu(tmp2, Address(a2)); 1769 addi(a1, a1, 4); 1770 addi(a2, a2, 4); 1771 bne(tmp1, tmp2, DONE); 1772 } 1773 1774 bind(TAIL03); 1775 // 0-3 bytes left. 1776 test_bit(tmp1, cnt1, 1); 1777 beqz(tmp1, TAIL01); 1778 { 1779 lhu(tmp1, Address(a1)); 1780 lhu(tmp2, Address(a2)); 1781 addi(a1, a1, 2); 1782 addi(a2, a2, 2); 1783 bne(tmp1, tmp2, DONE); 1784 } 1785 1786 bind(TAIL01); 1787 // 0-1 bytes left. 1788 test_bit(tmp1, cnt1, 0); 1789 beqz(tmp1, SAME); 1790 { 1791 lbu(tmp1, Address(a1)); 1792 lbu(tmp2, Address(a2)); 1793 bne(tmp1, tmp2, DONE); 1794 } 1795 1796 // Arrays are equal. 1797 bind(SAME); 1798 mv(result, true); 1799 1800 // That's it. 1801 bind(DONE); 1802 BLOCK_COMMENT("} string_equals"); 1803 } 1804 1805 // jdk.internal.util.ArraysSupport.vectorizedHashCode 1806 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 1807 Register tmp1, Register tmp2, Register tmp3, 1808 Register tmp4, Register tmp5, Register tmp6, 1809 BasicType eltype) 1810 { 1811 assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1); 1812 1813 const int elsize = arrays_hashcode_elsize(eltype); 1814 const int chunks_end_shift = exact_log2(elsize); 1815 1816 switch (eltype) { 1817 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 1818 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 1819 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 1820 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 1821 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 1822 default: 1823 ShouldNotReachHere(); 1824 } 1825 1826 const int stride = 4; 1827 const Register pow31_4 = tmp1; 1828 const Register pow31_3 = tmp2; 1829 const Register pow31_2 = tmp3; 1830 const Register chunks = tmp4; 1831 const Register chunks_end = chunks; 1832 1833 Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP; 1834 1835 // result has a value initially 1836 1837 beqz(cnt, DONE); 1838 1839 andi(chunks, cnt, ~(stride-1)); 1840 beqz(chunks, TAIL); 1841 1842 mv(pow31_4, 923521); // [31^^4] 1843 mv(pow31_3, 29791); // [31^^3] 1844 mv(pow31_2, 961); // [31^^2] 1845 1846 slli(chunks_end, chunks, chunks_end_shift); 1847 add(chunks_end, ary, chunks_end); 1848 andi(cnt, cnt, stride-1); // don't forget about tail! 
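// Illustrative reference only (not emitted code): each pass of WIDE_LOOP below folds one full
// chunk of four elements into the running hash exactly as the scalar statement
//
//   h = 923521 * h + 29791 * ary[i] + 961 * ary[i + 1] + 31 * ary[i + 2] + ary[i + 3];
//
// would (923521 == 31^4, 29791 == 31^3, 961 == 31^2), with 31 * x strength-reduced to (x << 5) - x.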
1849 1850 bind(WIDE_LOOP); 1851 mulw(result, result, pow31_4); // 31^^4 * h 1852 arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype); 1853 arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype); 1854 arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype); 1855 arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype); 1856 mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0] 1857 addw(result, result, t0); 1858 mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1] 1859 addw(result, result, t1); 1860 slli(t0, tmp5, 5); // optimize 31^^1 * ary[i+2] 1861 subw(tmp5, t0, tmp5); // with ary[i+2]<<5 - ary[i+2] 1862 addw(result, result, tmp5); 1863 addw(result, result, tmp6); // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1] 1864 // + 31^^1 * ary[i+2] + 31^^0 * ary[i+3] 1865 addi(ary, ary, elsize * stride); 1866 bne(ary, chunks_end, WIDE_LOOP); 1867 beqz(cnt, DONE); 1868 1869 bind(TAIL); 1870 slli(chunks_end, cnt, chunks_end_shift); 1871 add(chunks_end, ary, chunks_end); 1872 1873 bind(TAIL_LOOP); 1874 arrays_hashcode_elload(t0, Address(ary), eltype); 1875 slli(t1, result, 5); // optimize 31 * result 1876 subw(result, t1, result); // with result<<5 - result 1877 addw(result, result, t0); 1878 addi(ary, ary, elsize); 1879 bne(ary, chunks_end, TAIL_LOOP); 1880 1881 bind(DONE); 1882 BLOCK_COMMENT("} // arrays_hashcode"); 1883 } 1884 1885 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 1886 switch (eltype) { 1887 case T_BOOLEAN: return sizeof(jboolean); 1888 case T_BYTE: return sizeof(jbyte); 1889 case T_SHORT: return sizeof(jshort); 1890 case T_CHAR: return sizeof(jchar); 1891 case T_INT: return sizeof(jint); 1892 default: 1893 ShouldNotReachHere(); 1894 return -1; 1895 } 1896 } 1897 1898 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 1899 switch (eltype) { 1900 // T_BOOLEAN used as surrogate for unsigned byte 1901 case T_BOOLEAN: lbu(dst, src); break; 1902 case T_BYTE: lb(dst, src); break; 1903 case T_SHORT: lh(dst, src); break; 1904 case T_CHAR: lhu(dst, src); break; 1905 case T_INT: lw(dst, src); break; 1906 default: 1907 ShouldNotReachHere(); 1908 } 1909 } 1910 1911 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far); 1912 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label, 1913 bool is_far, bool is_unordered); 1914 1915 static conditional_branch_insn conditional_branches[] = 1916 { 1917 /* SHORT branches */ 1918 (conditional_branch_insn)&MacroAssembler::beq, 1919 (conditional_branch_insn)&MacroAssembler::bgt, 1920 nullptr, // BoolTest::overflow 1921 (conditional_branch_insn)&MacroAssembler::blt, 1922 (conditional_branch_insn)&MacroAssembler::bne, 1923 (conditional_branch_insn)&MacroAssembler::ble, 1924 nullptr, // BoolTest::no_overflow 1925 (conditional_branch_insn)&MacroAssembler::bge, 1926 1927 /* UNSIGNED branches */ 1928 (conditional_branch_insn)&MacroAssembler::beq, 1929 (conditional_branch_insn)&MacroAssembler::bgtu, 1930 nullptr, 1931 (conditional_branch_insn)&MacroAssembler::bltu, 1932 (conditional_branch_insn)&MacroAssembler::bne, 1933 (conditional_branch_insn)&MacroAssembler::bleu, 1934 nullptr, 1935 (conditional_branch_insn)&MacroAssembler::bgeu 1936 }; 1937 1938 static float_conditional_branch_insn float_conditional_branches[] = 1939 { 1940 /* FLOAT SHORT branches */ 1941 (float_conditional_branch_insn)&MacroAssembler::float_beq, 1942 (float_conditional_branch_insn)&MacroAssembler::float_bgt, 1943 
nullptr, // BoolTest::overflow
1944 (float_conditional_branch_insn)&MacroAssembler::float_blt,
1945 (float_conditional_branch_insn)&MacroAssembler::float_bne,
1946 (float_conditional_branch_insn)&MacroAssembler::float_ble,
1947 nullptr, // BoolTest::no_overflow
1948 (float_conditional_branch_insn)&MacroAssembler::float_bge,
1949
1950 /* DOUBLE SHORT branches */
1951 (float_conditional_branch_insn)&MacroAssembler::double_beq,
1952 (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1953 nullptr,
1954 (float_conditional_branch_insn)&MacroAssembler::double_blt,
1955 (float_conditional_branch_insn)&MacroAssembler::double_bne,
1956 (float_conditional_branch_insn)&MacroAssembler::double_ble,
1957 nullptr,
1958 (float_conditional_branch_insn)&MacroAssembler::double_bge
1959 };
1960
1961 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1962 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1963 "invalid conditional branch index");
1964 (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1965 }
1966
1967 // This function should only be used by C2. For unordered-greater, flip the unordered case: C2 uses
1968 // unordered-lesser instead of unordered-greater, and finally commutes the result bits in do_one_bytecode().
1969 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1970 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1971 "invalid float conditional branch index");
1972 int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1973 (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1974 (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1975 }
1976
1977 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1978 switch (cmpFlag) {
1979 case BoolTest::eq:
1980 case BoolTest::le:
1981 beqz(op1, L, is_far);
1982 break;
1983 case BoolTest::ne:
1984 case BoolTest::gt:
1985 bnez(op1, L, is_far);
1986 break;
1987 default:
1988 ShouldNotReachHere();
1989 }
1990 }
1991
1992 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1993 switch (cmpFlag) {
1994 case BoolTest::eq:
1995 beqz(op1, L, is_far);
1996 break;
1997 case BoolTest::ne:
1998 bnez(op1, L, is_far);
1999 break;
2000 default:
2001 ShouldNotReachHere();
2002 }
2003 }
2004
2005 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
2006 Label L;
2007 cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
2008 mv(dst, src);
2009 bind(L);
2010 }
2011
2012 // Set dst to NaN if any NaN input.
2013 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
2014 bool is_double, bool is_min) {
2015 assert_different_registers(dst, src1, src2);
2016
2017 Label Done, Compare;
2018
2019 is_double ? fclass_d(t0, src1)
2020 : fclass_s(t0, src1);
2021 is_double ? fclass_d(t1, src2)
2022 : fclass_s(t1, src2);
2023 orr(t0, t0, t1);
2024 andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2025 beqz(t0, Compare);
2026 is_double ? fadd_d(dst, src1, src2)
2027 : fadd_s(dst, src1, src2);
2028 j(Done);
2029
2030 bind(Compare);
2031 if (is_double) {
2032 is_min ?
fmin_d(dst, src1, src2) 2033 : fmax_d(dst, src1, src2); 2034 } else { 2035 is_min ? fmin_s(dst, src1, src2) 2036 : fmax_s(dst, src1, src2); 2037 } 2038 2039 bind(Done); 2040 } 2041 2042 // According to Java SE specification, for floating-point round operations, if 2043 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the 2044 // rounded result; this differs from behavior of RISC-V fcvt instructions (which 2045 // round out-of-range values to the nearest max or min value), therefore special 2046 // handling is needed by NaN, +/-Infinity, +/-0. 2047 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode, 2048 Register tmp1, Register tmp2, Register tmp3) { 2049 2050 assert_different_registers(dst, src); 2051 assert_different_registers(tmp1, tmp2, tmp3); 2052 2053 // Set rounding mode for conversions 2054 // Here we use similar modes to double->long and long->double conversions 2055 // Different mode for long->double conversion matter only if long value was not representable as double, 2056 // we got long value as a result of double->long conversion so, it is definitely representable 2057 RoundingMode rm; 2058 switch (round_mode) { 2059 case RoundDoubleModeNode::rmode_ceil: 2060 rm = RoundingMode::rup; 2061 break; 2062 case RoundDoubleModeNode::rmode_floor: 2063 rm = RoundingMode::rdn; 2064 break; 2065 case RoundDoubleModeNode::rmode_rint: 2066 rm = RoundingMode::rne; 2067 break; 2068 default: 2069 ShouldNotReachHere(); 2070 } 2071 2072 // tmp1 - is a register to store double converted to long int 2073 // tmp2 - is a register to create constant for comparison 2074 // tmp3 - is a register where we store modified result of double->long conversion 2075 Label done, bad_val; 2076 2077 // Conversion from double to long 2078 fcvt_l_d(tmp1, src, rm); 2079 2080 // Generate constant (tmp2) 2081 // tmp2 = 100...0000 2082 addi(tmp2, zr, 1); 2083 slli(tmp2, tmp2, 63); 2084 2085 // Prepare converted long (tmp1) 2086 // as a result when conversion overflow we got: 2087 // tmp1 = 011...1111 or 100...0000 2088 // Convert it to: tmp3 = 100...0000 2089 addi(tmp3, tmp1, 1); 2090 andi(tmp3, tmp3, -2); 2091 beq(tmp3, tmp2, bad_val); 2092 2093 // Conversion from long to double 2094 fcvt_d_l(dst, tmp1, rm); 2095 // Add sign of input value to result for +/- 0 cases 2096 fsgnj_d(dst, dst, src); 2097 j(done); 2098 2099 // If got conversion overflow return src 2100 bind(bad_val); 2101 fmv_d(dst, src); 2102 2103 bind(done); 2104 } 2105 2106 // According to Java SE specification, for floating-point signum operations, if 2107 // on input we have NaN or +/-0.0 value we should return it, 2108 // otherwise return +/- 1.0 using sign of input. 2109 // one - gives us a floating-point 1.0 (got from matching rule) 2110 // bool is_double - specifies single or double precision operations will be used. 2111 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) { 2112 Label done; 2113 2114 is_double ? fclass_d(t0, dst) 2115 : fclass_s(t0, dst); 2116 2117 // check if input is -0, +0, signaling NaN or quiet NaN 2118 andi(t0, t0, fclass_mask::zero | fclass_mask::nan); 2119 2120 bnez(t0, done); 2121 2122 // use floating-point 1.0 with a sign of input 2123 is_double ? fsgnj_d(dst, one, dst) 2124 : fsgnj_s(dst, one, dst); 2125 2126 bind(done); 2127 } 2128 2129 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) { 2130 #define __ masm. 
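// Worked example (illustrative only): for a half-precision NaN or Inf input this stub effectively
// produces sign | 0x7f800000 | (the 10 mantissa bits << 13); e.g. the half NaN 0x7e01 (payload 0x201)
// widens to 0x7fc02000, so non-canonical payloads survive the conversion.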
2131 FloatRegister dst = stub.data<0>();
2132 Register src = stub.data<1>();
2133 Register tmp = stub.data<2>();
2134 __ bind(stub.entry());
2135
2136 // following instructions mainly focus on NaN, as riscv does not handle
2137 // NaN well with fcvt, but the code also works for Inf at the same time.
2138
2139 // construct a NaN in 32 bits from the NaN in 16 bits,
2140 // we need the payloads of non-canonical NaNs to be preserved.
2141 __ mv(tmp, 0x7f800000);
2142 // sign-bit was already set via sign-extension if necessary.
2143 __ slli(t0, src, 13);
2144 __ orr(tmp, t0, tmp);
2145 __ fmv_w_x(dst, tmp);
2146
2147 __ j(stub.continuation());
2148 #undef __
2149 }
2150
2151 // j.l.Float.float16ToFloat
2152 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
2153 auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
2154
2155 // On riscv, NaN needs special processing as fcvt does not work in that case.
2156 // On riscv, Inf does not need special processing as fcvt can handle it correctly.
2157 // However, we let the slow path process NaN and Inf at the same time,
2158 // as both of them are rare cases, and if we made the slow path handle
2159 // only the NaN case it would sacrifice the performance of the normal,
2160 // i.e. non-NaN and non-Inf, cases.
2161
2162 // check whether it's a NaN or +/- Inf.
2163 mv(t0, 0x7c00);
2164 andr(tmp, src, t0);
2165 // jump to stub processing NaN and Inf cases.
2166 beq(t0, tmp, stub->entry());
2167
2168 // non-NaN or non-Inf cases, just use built-in instructions.
2169 fmv_h_x(dst, src);
2170 fcvt_s_h(dst, dst);
2171
2172 bind(stub->continuation());
2173 }
2174
2175 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
2176 #define __ masm.
2177 Register dst = stub.data<0>();
2178 FloatRegister src = stub.data<1>();
2179 Register tmp = stub.data<2>();
2180 __ bind(stub.entry());
2181
2182 __ fmv_x_w(dst, src);
2183
2184 // preserve the payloads of non-canonical NaNs.
2185 __ srai(dst, dst, 13);
2186 // preserve the sign bit.
2187 __ srai(tmp, dst, 13);
2188 __ slli(tmp, tmp, 10);
2189 __ mv(t0, 0x3ff);
2190 __ orr(tmp, tmp, t0);
2191
2192 // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2193 __ andr(dst, dst, tmp);
2194
2195 __ j(stub.continuation());
2196 #undef __
2197 }
2198
2199 // j.l.Float.floatToFloat16
2200 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
2201 auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);
2202
2203 // On riscv, NaN needs special processing as fcvt does not work in that case.
2204
2205 // check whether it's a NaN.
2206 // replace fclass with feq as a performance optimization.
2207 feq_s(t0, src, src);
2208 // jump to stub processing NaN cases.
2209 beqz(t0, stub->entry());
2210
2211 // non-NaN cases, just use built-in instructions.
2212 fcvt_h_s(ftmp, src);
2213 fmv_x_h(dst, ftmp);
2214
2215 bind(stub->continuation());
2216 }
2217
2218 static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
2219 #define __ masm.
2220 VectorRegister dst = stub.data<0>();
2221 VectorRegister src = stub.data<1>();
2222 uint vector_length = stub.data<2>();
2223 __ bind(stub.entry());
2224
2225 // following instructions mainly focus on NaN, as riscv does not handle
2226 // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
2227 //
2228 // construct NaN's in 32 bits from the NaN's in 16 bits,
2229 // we need the payloads of non-canonical NaNs to be preserved.
2230
2231 // adjust vector type to 2 * SEW.
2232 __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
2233 // widen and sign-extend src data.
2234 __ vsext_vf2(dst, src, Assembler::v0_t);
2235 __ mv(t0, 0x7f800000);
2236 // sign-bit was already set via sign-extension if necessary.
2237 __ vsll_vi(dst, dst, 13, Assembler::v0_t);
2238 __ vor_vx(dst, dst, t0, Assembler::v0_t);
2239
2240 __ j(stub.continuation());
2241 #undef __
2242 }
2243
2244 // j.l.Float.float16ToFloat
2245 void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
2246 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
2247 (dst, src, vector_length, 24, float16_to_float_v_slow_path);
2248 assert_different_registers(dst, src);
2249
2250 // On riscv, NaN needs special processing as vfwcvt_f_f_v does not work in that case.
2251 // On riscv, Inf does not need special processing as vfwcvt_f_f_v can handle it correctly.
2252 // However, we let the slow path process NaN and Inf at the same time,
2253 // as both of them are rare cases, and if we made the slow path handle
2254 // only the NaN case it would sacrifice the performance of the normal,
2255 // i.e. non-NaN and non-Inf, cases.
2256
2257 vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);
2258
2259 // check whether there is a NaN or +/- Inf.
2260 mv(t0, 0x7c00);
2261 vand_vx(v0, src, t0);
2262 // v0 will be used as mask in slow path.
2263 vmseq_vx(v0, v0, t0);
2264 vcpop_m(t0, v0);
2265
2266 // For non-NaN or non-Inf cases, just use built-in instructions.
2267 vfwcvt_f_f_v(dst, src);
2268
2269 // jump to the stub processing NaN and Inf cases if there is any of them anywhere in the vector.
2270 bnez(t0, stub->entry());
2271
2272 bind(stub->continuation());
2273 }
2274
2275 static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
2276 C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
2277 #define __ masm.
2278 VectorRegister dst = stub.data<0>();
2279 VectorRegister src = stub.data<1>();
2280 VectorRegister tmp = stub.data<2>();
2281 __ bind(stub.entry());
2282
2283 // lmul is already set to mf2 in float_to_float16_v.
2284
2285 // preserve the payloads of non-canonical NaNs.
2286 __ vnsra_wi(dst, src, 13, Assembler::v0_t);
2287
2288 // preserve the sign bit.
2289 __ vnsra_wi(tmp, src, 26, Assembler::v0_t);
2290 __ vsll_vi(tmp, tmp, 10, Assembler::v0_t);
2291 __ mv(t0, 0x3ff);
2292 __ vor_vx(tmp, tmp, t0, Assembler::v0_t);
2293
2294 // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
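// Worked example (illustrative only): for the 32-bit NaN 0x7fc02000 the two narrowing shifts
// give dst == 0xfe01 (low 16 bits of src >> 13) and tmp == (0x1f << 10) | 0x3ff == 0x7fff, so
// the masking below yields the half NaN 0x7e01: sign bit and non-canonical payload 0x201 survive.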
2295 __ vand_vv(dst, dst, tmp, Assembler::v0_t);
2296
2297 __ j(stub.continuation());
2298 #undef __
2299 }
2300
2301 // j.l.Float.floatToFloat16
2302 void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src, VectorRegister vtmp,
2303 Register tmp, uint vector_length) {
2304 assert_different_registers(dst, src, vtmp);
2305
2306 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2307 (dst, src, vtmp, 28, float_to_float16_v_slow_path);
2308
2309 // On riscv, NaN needs special processing as vfncvt_f_f_w does not work in that case.
2310
2311 vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);
2312
2313 // check whether there is a NaN.
2314 // replace vfclass with vmfne_vv as a performance optimization.
2315 vmfne_vv(v0, src, src);
2316 vcpop_m(t0, v0);
2317
2318 vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);
2319
2320 // For non-NaN cases, just use built-in instructions.
2321 vfncvt_f_f_w(dst, src);
2322
2323 // jump to stub processing NaN cases.
2324 bnez(t0, stub->entry());
2325
2326 bind(stub->continuation());
2327 }
2328
2329 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
2330 vsetvli_helper(bt, vlen);
2331
2332 // check if input is -0, +0, signaling NaN or quiet NaN
2333 vfclass_v(v0, dst);
2334 mv(t0, fclass_mask::zero | fclass_mask::nan);
2335 vand_vx(v0, v0, t0);
2336 vmseq_vi(v0, v0, 0);
2337
2338 // use floating-point 1.0 with a sign of input
2339 vfsgnj_vv(dst, one, dst, v0_t);
2340 }
2341
2342 void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
2343 Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2344 // intrinsic is enabled when MaxVectorSize >= 16
2345 Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2346 long len = is_long ? 64 : 32;
2347
2348 // load the src data(in bits) to be compressed.
2349 vsetivli(x0, 1, sew, Assembler::m1);
2350 vmv_s_x(v0, src);
2351 // reset the src data(in bytes) to zero.
2352 mv(t0, len);
2353 vsetvli(x0, t0, Assembler::e8, lmul);
2354 vmv_v_i(v4, 0);
2355 // convert the src data from bits to bytes.
2356 vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
2357 // reset the dst data(in bytes) to zero.
2358 vmv_v_i(v8, 0);
2359 // load the mask data(in bits).
2360 vsetivli(x0, 1, sew, Assembler::m1);
2361 vmv_s_x(v0, mask);
2362 // compress the src data(in bytes) to dst(in bytes).
2363 vsetvli(x0, t0, Assembler::e8, lmul);
2364 vcompress_vm(v8, v4, v0);
2365 // convert the dst data from bytes to bits.
2366 vmseq_vi(v0, v8, 1);
2367 // store result back.
2368 vsetivli(x0, 1, sew, Assembler::m1);
2369 vmv_x_s(dst, v0);
2370 }
2371
2372 void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
2373 compress_bits_v(dst, src, mask, /* is_long */ false);
2374 }
2375
2376 void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
2377 compress_bits_v(dst, src, mask, /* is_long */ true);
2378 }
2379
2380 void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
2381 Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2382 // intrinsic is enabled when MaxVectorSize >= 16
2383 Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2384 long len = is_long ? 64 : 32;
2385
2386 // load the src data(in bits) to be expanded.
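// Illustrative reference only (hypothetical helper, not part of the build): for the is_long
// case the vector sequence below computes the equivalent of
//
//   long expand_bits_ref(long src, long mask) {
//     long dst = 0;
//     for (int i = 0, j = 0; i < 64; i++) {
//       if ((mask >> i) & 1) {              // for each set bit of the mask ...
//         dst |= ((src >> j++) & 1L) << i;  // ... deposit the next source bit at position i
//       }
//     }
//     return dst;
//   }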
2387 vsetivli(x0, 1, sew, Assembler::m1);
2388 vmv_s_x(v0, src);
2389 // reset the src data(in bytes) to zero.
2390 mv(t0, len);
2391 vsetvli(x0, t0, Assembler::e8, lmul);
2392 vmv_v_i(v4, 0);
2393 // convert the src data from bits to bytes.
2394 vmerge_vim(v4, v4, 1); // v0 as implicit mask register
2395 // reset the dst data(in bytes) to zero.
2396 vmv_v_i(v12, 0);
2397 // load the mask data(in bits).
2398 vsetivli(x0, 1, sew, Assembler::m1);
2399 vmv_s_x(v0, mask);
2400 // expand the src data(in bytes) to dst(in bytes).
2401 vsetvli(x0, t0, Assembler::e8, lmul);
2402 viota_m(v8, v0);
2403 vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
2404 // convert the dst data from bytes to bits.
2405 vmseq_vi(v0, v12, 1);
2406 // store result back.
2407 vsetivli(x0, 1, sew, Assembler::m1);
2408 vmv_x_s(dst, v0);
2409 }
2410
2411 void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
2412 expand_bits_v(dst, src, mask, /* is_long */ false);
2413 }
2414
2415 void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
2416 expand_bits_v(dst, src, mask, /* is_long */ true);
2417 }
2418
2419 // j.l.Math.round(float)
2420 // Returns the closest int to the argument, with ties rounding to positive infinity.
2421 // We need to handle 3 special cases defined by java api spec:
2422 // NaN,
2423 // float >= Integer.MAX_VALUE,
2424 // float <= Integer.MIN_VALUE.
2425 void C2_MacroAssembler::java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2426 BasicType bt, uint vector_length) {
2427 // In riscv, there is no rounding mode that directly matches the behaviour defined
2428 // by the java api spec, i.e. every rounding mode mishandles some corner case, e.g.
2429 // RNE is the closest one, but it ties to "even", which means 1.5/2.5 will both be converted
2430 // to 2, instead of 2 and 3 respectively.
2431 // RUP does not work either; although the java api requires "rounding to positive infinity",
2432 // both 1.3/1.8 would be converted to 2, instead of 1 and 2 respectively.
2433 //
2434 // The optimal solution for non-NaN cases is:
2435 // src+0.5 => dst, with rdn rounding mode,
2436 // convert dst from float to int, with rdn rounding mode.
2437 // and this solution works as expected for float >= Integer.MAX_VALUE and float <= Integer.MIN_VALUE.
2438 //
2439 // But we still need to handle NaN explicitly with vector mask instructions.
2440 //
2441 // Check MacroAssembler::java_round_float and C2_MacroAssembler::vector_round_sve in aarch64 for more details.
2442
2443 csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2444 vsetvli_helper(bt, vector_length);
2445
2446 // Don't rearrange the instruction sequence order without performance testing.
2447 // Check MacroAssembler::java_round_float in riscv64 for more details.
2448 mv(t0, jint_cast(0.5f));
2449 fmv_w_x(ftmp, t0);
2450
2451 // replacing vfclass with vmfeq_vv as a performance optimization
2452 vmfeq_vv(v0, src, src);
2453 // set dst = 0 in cases of NaN
2454 vmv_v_x(dst, zr);
2455
2456 // dst = (src + 0.5) rounded down towards negative infinity
2457 vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2458 vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2459
2460 csrwi(CSR_FRM, C2_MacroAssembler::rne);
2461 }
2462
2463 // java.lang.Math.round(double a)
2464 // Returns the closest long to the argument, with ties rounding to positive infinity.
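// Illustrative reference only (hypothetical scalar sketch): with FRM switched to rdn, the add
// and the conversion in the vector sequence below behave roughly like
//
//   long java_round_ref(double a) {
//     if (a != a) return 0;           // NaN lanes are forced to zero
//     return (long)floor(a + 0.5);    // add 0.5, then round towards negative infinity
//   }
//
// where the conversion saturates out-of-range and +/-Inf inputs, matching the
// MIN_VALUE/MAX_VALUE requirements described for the float variant above.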
2465 void C2_MacroAssembler::java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp, 2466 BasicType bt, uint vector_length) { 2467 // check C2_MacroAssembler::java_round_float_v above for more details. 2468 2469 csrwi(CSR_FRM, C2_MacroAssembler::rdn); 2470 vsetvli_helper(bt, vector_length); 2471 2472 mv(t0, julong_cast(0.5)); 2473 fmv_d_x(ftmp, t0); 2474 2475 // replacing vfclass with feq as performance optimization 2476 vmfeq_vv(v0, src, src); 2477 // set dst = 0 in cases of NaN 2478 vmv_v_x(dst, zr); 2479 2480 // dst = (src + 0.5) rounded down towards negative infinity 2481 vfadd_vf(dst, src, ftmp, Assembler::v0_t); 2482 vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn 2483 2484 csrwi(CSR_FRM, C2_MacroAssembler::rne); 2485 } 2486 2487 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2, 2488 VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE, 2489 Assembler::LMUL lmul) { 2490 Label loop; 2491 Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16; 2492 2493 bind(loop); 2494 vsetvli(tmp1, cnt, sew, lmul); 2495 vlex_v(vr1, a1, sew); 2496 vlex_v(vr2, a2, sew); 2497 vmsne_vv(vrs, vr1, vr2); 2498 vfirst_m(tmp2, vrs); 2499 bgez(tmp2, DONE); 2500 sub(cnt, cnt, tmp1); 2501 if (!islatin) { 2502 slli(tmp1, tmp1, 1); // get byte counts 2503 } 2504 add(a1, a1, tmp1); 2505 add(a2, a2, tmp1); 2506 bnez(cnt, loop); 2507 2508 mv(result, true); 2509 } 2510 2511 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) { 2512 Label DONE; 2513 Register tmp1 = t0; 2514 Register tmp2 = t1; 2515 2516 BLOCK_COMMENT("string_equals_v {"); 2517 2518 mv(result, false); 2519 2520 element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE, Assembler::m2); 2521 2522 bind(DONE); 2523 BLOCK_COMMENT("} string_equals_v"); 2524 } 2525 2526 // used by C2 ClearArray patterns. 2527 // base: Address of a buffer to be zeroed 2528 // cnt: Count in HeapWords 2529 // 2530 // base, cnt, v4, v5, v6, v7 and t0 are clobbered. 2531 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) { 2532 Label loop; 2533 2534 // making zero words 2535 vsetvli(t0, cnt, Assembler::e64, Assembler::m4); 2536 vxor_vv(v4, v4, v4); 2537 2538 bind(loop); 2539 vsetvli(t0, cnt, Assembler::e64, Assembler::m4); 2540 vse64_v(v4, base); 2541 sub(cnt, cnt, t0); 2542 shadd(base, t0, base, t0, 3); 2543 bnez(cnt, loop); 2544 } 2545 2546 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result, 2547 Register cnt1, int elem_size) { 2548 Label DONE; 2549 Register tmp1 = t0; 2550 Register tmp2 = t1; 2551 Register cnt2 = tmp2; 2552 int length_offset = arrayOopDesc::length_offset_in_bytes(); 2553 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? 
T_CHAR : T_BYTE); 2554 2555 BLOCK_COMMENT("arrays_equals_v {"); 2556 2557 // if (a1 == a2), return true 2558 mv(result, true); 2559 beq(a1, a2, DONE); 2560 2561 mv(result, false); 2562 // if a1 == null or a2 == null, return false 2563 beqz(a1, DONE); 2564 beqz(a2, DONE); 2565 // if (a1.length != a2.length), return false 2566 lwu(cnt1, Address(a1, length_offset)); 2567 lwu(cnt2, Address(a2, length_offset)); 2568 bne(cnt1, cnt2, DONE); 2569 2570 la(a1, Address(a1, base_offset)); 2571 la(a2, Address(a2, base_offset)); 2572 2573 element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE, Assembler::m2); 2574 2575 bind(DONE); 2576 2577 BLOCK_COMMENT("} arrays_equals_v"); 2578 } 2579 2580 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2, 2581 Register result, Register tmp1, Register tmp2, int encForm) { 2582 Label DIFFERENCE, DONE, L, loop; 2583 bool encLL = encForm == StrIntrinsicNode::LL; 2584 bool encLU = encForm == StrIntrinsicNode::LU; 2585 bool encUL = encForm == StrIntrinsicNode::UL; 2586 2587 bool str1_isL = encLL || encLU; 2588 bool str2_isL = encLL || encUL; 2589 2590 int minCharsInWord = encLL ? wordSize : wordSize / 2; 2591 2592 BLOCK_COMMENT("string_compare {"); 2593 2594 // for Latin strings, 1 byte for 1 character 2595 // for UTF16 strings, 2 bytes for 1 character 2596 if (!str1_isL) 2597 sraiw(cnt1, cnt1, 1); 2598 if (!str2_isL) 2599 sraiw(cnt2, cnt2, 1); 2600 2601 // if str1 == str2, return the difference 2602 // save the minimum of the string lengths in cnt2. 2603 sub(result, cnt1, cnt2); 2604 bgt(cnt1, cnt2, L); 2605 mv(cnt2, cnt1); 2606 bind(L); 2607 2608 // We focus on the optimization of small sized string. 2609 // Please check below document for string size distribution statistics. 2610 // https://cr.openjdk.org/~shade/density/string-density-report.pdf 2611 if (str1_isL == str2_isL) { // LL or UU 2612 // Below construction of v regs and lmul is based on test on 2 different boards, 2613 // vlen == 128 and vlen == 256 respectively. 2614 if (!encLL && MaxVectorSize == 16) { // UU 2615 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v4, v8, v4, encLL, DIFFERENCE, Assembler::m4); 2616 } else { // UU + MaxVectorSize or LL 2617 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE, Assembler::m2); 2618 } 2619 2620 j(DONE); 2621 } else { // LU or UL 2622 Register strL = encLU ? str1 : str2; 2623 Register strU = encLU ? str2 : str1; 2624 VectorRegister vstr1 = encLU ? v8 : v4; 2625 VectorRegister vstr2 = encLU ? v4 : v8; 2626 2627 bind(loop); 2628 vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2); 2629 vle8_v(vstr1, strL); 2630 vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4); 2631 vzext_vf2(vstr2, vstr1); 2632 vle16_v(vstr1, strU); 2633 vmsne_vv(v4, vstr2, vstr1); 2634 vfirst_m(tmp2, v4); 2635 bgez(tmp2, DIFFERENCE); 2636 sub(cnt2, cnt2, tmp1); 2637 add(strL, strL, tmp1); 2638 shadd(strU, tmp1, strU, tmp1, 1); 2639 bnez(cnt2, loop); 2640 j(DONE); 2641 } 2642 2643 bind(DIFFERENCE); 2644 slli(tmp1, tmp2, 1); 2645 add(str1, str1, str1_isL ? tmp2 : tmp1); 2646 add(str2, str2, str2_isL ? tmp2 : tmp1); 2647 str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0)); 2648 str2_isL ? 
lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0)); 2649 sub(result, tmp1, tmp2); 2650 2651 bind(DONE); 2652 } 2653 2654 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) { 2655 Label loop; 2656 assert_different_registers(src, dst, len, tmp, t0); 2657 2658 BLOCK_COMMENT("byte_array_inflate_v {"); 2659 bind(loop); 2660 vsetvli(tmp, len, Assembler::e8, Assembler::m2); 2661 vle8_v(v6, src); 2662 vsetvli(t0, len, Assembler::e16, Assembler::m4); 2663 vzext_vf2(v4, v6); 2664 vse16_v(v4, dst); 2665 sub(len, len, tmp); 2666 add(src, src, tmp); 2667 shadd(dst, tmp, dst, tmp, 1); 2668 bnez(len, loop); 2669 BLOCK_COMMENT("} byte_array_inflate_v"); 2670 } 2671 2672 // Compress char[] array to byte[]. 2673 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) 2674 // result: the array length if every element in array can be encoded, 2675 // otherwise, the index of first non-latin1 (> 0xff) character. 2676 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len, 2677 Register result, Register tmp) { 2678 encode_iso_array_v(src, dst, len, result, tmp, false); 2679 } 2680 2681 // Intrinsic for 2682 // 2683 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray 2684 // return the number of characters copied. 2685 // - java/lang/StringUTF16.compress 2686 // return index of non-latin1 character if copy fails, otherwise 'len'. 2687 // 2688 // This version always returns the number of characters copied. A successful 2689 // copy will complete with the post-condition: 'res' == 'len', while an 2690 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'. 2691 // 2692 // Clobbers: src, dst, len, result, t0 2693 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len, 2694 Register result, Register tmp, bool ascii) { 2695 Label loop, fail, done; 2696 2697 BLOCK_COMMENT("encode_iso_array_v {"); 2698 mv(result, 0); 2699 2700 bind(loop); 2701 mv(tmp, ascii ? 
0x7f : 0xff); 2702 vsetvli(t0, len, Assembler::e16, Assembler::m2); 2703 vle16_v(v2, src); 2704 2705 vmsgtu_vx(v1, v2, tmp); 2706 vfirst_m(tmp, v1); 2707 vmsbf_m(v0, v1); 2708 // compress char to byte 2709 vsetvli(t0, len, Assembler::e8); 2710 vncvt_x_x_w(v1, v2, Assembler::v0_t); 2711 vse8_v(v1, dst, Assembler::v0_t); 2712 2713 // fail if char > 0x7f/0xff 2714 bgez(tmp, fail); 2715 add(result, result, t0); 2716 add(dst, dst, t0); 2717 sub(len, len, t0); 2718 shadd(src, t0, src, t0, 1); 2719 bnez(len, loop); 2720 j(done); 2721 2722 bind(fail); 2723 add(result, result, tmp); 2724 2725 bind(done); 2726 BLOCK_COMMENT("} encode_iso_array_v"); 2727 } 2728 2729 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) { 2730 Label LOOP, SET_RESULT, DONE; 2731 2732 BLOCK_COMMENT("count_positives_v {"); 2733 assert_different_registers(ary, len, result, tmp); 2734 2735 mv(result, zr); 2736 2737 bind(LOOP); 2738 vsetvli(t0, len, Assembler::e8, Assembler::m4); 2739 vle8_v(v4, ary); 2740 vmslt_vx(v4, v4, zr); 2741 vfirst_m(tmp, v4); 2742 bgez(tmp, SET_RESULT); 2743 // if tmp == -1, all bytes are positive 2744 add(result, result, t0); 2745 2746 sub(len, len, t0); 2747 add(ary, ary, t0); 2748 bnez(len, LOOP); 2749 j(DONE); 2750 2751 // add remaining positive bytes count 2752 bind(SET_RESULT); 2753 add(result, result, tmp); 2754 2755 bind(DONE); 2756 BLOCK_COMMENT("} count_positives_v"); 2757 } 2758 2759 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1, 2760 Register ch, Register result, 2761 Register tmp1, Register tmp2, 2762 bool isL) { 2763 mv(result, zr); 2764 2765 Label loop, MATCH, DONE; 2766 Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16; 2767 bind(loop); 2768 vsetvli(tmp1, cnt1, sew, Assembler::m4); 2769 vlex_v(v4, str1, sew); 2770 vmseq_vx(v4, v4, ch); 2771 vfirst_m(tmp2, v4); 2772 bgez(tmp2, MATCH); // if equal, return index 2773 2774 add(result, result, tmp1); 2775 sub(cnt1, cnt1, tmp1); 2776 if (!isL) slli(tmp1, tmp1, 1); 2777 add(str1, str1, tmp1); 2778 bnez(cnt1, loop); 2779 2780 mv(result, -1); 2781 j(DONE); 2782 2783 bind(MATCH); 2784 add(result, result, tmp2); 2785 2786 bind(DONE); 2787 } 2788 2789 // Set dst to NaN if any NaN input. 2790 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, 2791 BasicType bt, bool is_min, uint vector_length) { 2792 assert_different_registers(dst, src1, src2); 2793 2794 vsetvli_helper(bt, vector_length); 2795 2796 is_min ? vfmin_vv(dst, src1, src2) 2797 : vfmax_vv(dst, src1, src2); 2798 2799 vmfne_vv(v0, src1, src1); 2800 vfadd_vv(dst, src1, src1, Assembler::v0_t); 2801 vmfne_vv(v0, src2, src2); 2802 vfadd_vv(dst, src2, src2, Assembler::v0_t); 2803 } 2804 2805 // Set dst to NaN if any NaN input. 2806 // The destination vector register elements corresponding to masked-off elements 2807 // are handled with a mask-undisturbed policy. 2808 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, 2809 VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2, 2810 BasicType bt, bool is_min, uint vector_length) { 2811 assert_different_registers(src1, src2, tmp1, tmp2); 2812 vsetvli_helper(bt, vector_length); 2813 2814 // Check vector elements of src1 and src2 for NaN. 
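// Illustrative reference only: for each active (vmask) lane i the sequence below computes
//   dst[i] = isNaN(src2[i]) ? src2[i] + src2[i]   // quiet NaN propagated from src2
//          : isNaN(src1[i]) ? src1[i] + src1[i]   // quiet NaN propagated from src1
//          : min-or-max(src1[i], src2[i]);
// masked-off lanes keep their previous dst value (mask-undisturbed policy).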
2815 vmfeq_vv(tmp1, src1, src1); 2816 vmfeq_vv(tmp2, src2, src2); 2817 2818 vmandn_mm(v0, vmask, tmp1); 2819 vfadd_vv(dst, src1, src1, Assembler::v0_t); 2820 vmandn_mm(v0, vmask, tmp2); 2821 vfadd_vv(dst, src2, src2, Assembler::v0_t); 2822 2823 vmand_mm(tmp2, tmp1, tmp2); 2824 vmand_mm(v0, vmask, tmp2); 2825 is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t) 2826 : vfmax_vv(dst, src1, src2, Assembler::v0_t); 2827 } 2828 2829 // Set dst to NaN if any NaN input. 2830 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst, 2831 FloatRegister src1, VectorRegister src2, 2832 VectorRegister tmp1, VectorRegister tmp2, 2833 bool is_double, bool is_min, uint vector_length, VectorMask vm) { 2834 assert_different_registers(dst, src1); 2835 assert_different_registers(src2, tmp1, tmp2); 2836 2837 Label L_done, L_NaN_1, L_NaN_2; 2838 // Set dst to src1 if src1 is NaN 2839 is_double ? feq_d(t0, src1, src1) 2840 : feq_s(t0, src1, src1); 2841 beqz(t0, L_NaN_2); 2842 2843 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length); 2844 vfmv_s_f(tmp2, src1); 2845 2846 is_min ? vfredmin_vs(tmp1, src2, tmp2, vm) 2847 : vfredmax_vs(tmp1, src2, tmp2, vm); 2848 vfmv_f_s(dst, tmp1); 2849 2850 // Checking NaNs in src2 2851 vmfne_vv(tmp1, src2, src2, vm); 2852 vcpop_m(t0, tmp1, vm); 2853 beqz(t0, L_done); 2854 2855 bind(L_NaN_1); 2856 vfredusum_vs(tmp1, src2, tmp2, vm); 2857 vfmv_f_s(dst, tmp1); 2858 j(L_done); 2859 2860 bind(L_NaN_2); 2861 is_double ? fmv_d(dst, src1) 2862 : fmv_s(dst, src1); 2863 bind(L_done); 2864 } 2865 2866 bool C2_MacroAssembler::in_scratch_emit_size() { 2867 if (ciEnv::current()->task() != nullptr) { 2868 PhaseOutput* phase_output = Compile::current()->output(); 2869 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2870 return true; 2871 } 2872 } 2873 return MacroAssembler::in_scratch_emit_size(); 2874 } 2875 2876 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1, 2877 VectorRegister src2, VectorRegister tmp, 2878 int opc, BasicType bt, uint vector_length, VectorMask vm) { 2879 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2880 vsetvli_helper(bt, vector_length); 2881 vmv_s_x(tmp, src1); 2882 switch (opc) { 2883 case Op_AddReductionVI: 2884 case Op_AddReductionVL: 2885 vredsum_vs(tmp, src2, tmp, vm); 2886 break; 2887 case Op_AndReductionV: 2888 vredand_vs(tmp, src2, tmp, vm); 2889 break; 2890 case Op_OrReductionV: 2891 vredor_vs(tmp, src2, tmp, vm); 2892 break; 2893 case Op_XorReductionV: 2894 vredxor_vs(tmp, src2, tmp, vm); 2895 break; 2896 case Op_MaxReductionV: 2897 vredmax_vs(tmp, src2, tmp, vm); 2898 break; 2899 case Op_MinReductionV: 2900 vredmin_vs(tmp, src2, tmp, vm); 2901 break; 2902 default: 2903 ShouldNotReachHere(); 2904 } 2905 vmv_x_s(dst, tmp); 2906 } 2907 2908 // Set vl and vtype for full and partial vector operations. 
2909 // (vma = mu, vta = tu, vill = false) 2910 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) { 2911 Assembler::SEW sew = Assembler::elemtype_to_sew(bt); 2912 if (vector_length <= 31) { 2913 vsetivli(tmp, vector_length, sew, vlmul); 2914 } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) { 2915 vsetvli(tmp, x0, sew, vlmul); 2916 } else { 2917 mv(tmp, vector_length); 2918 vsetvli(tmp, tmp, sew, vlmul); 2919 } 2920 } 2921 2922 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2, 2923 int cond, BasicType bt, uint vector_length, VectorMask vm) { 2924 assert(is_integral_type(bt), "unsupported element type"); 2925 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers"); 2926 vsetvli_helper(bt, vector_length); 2927 vmclr_m(vd); 2928 switch (cond) { 2929 case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break; 2930 case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break; 2931 case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break; 2932 case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break; 2933 case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break; 2934 case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break; 2935 case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break; 2936 case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break; 2937 case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break; 2938 case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break; 2939 default: 2940 assert(false, "unsupported compare condition"); 2941 ShouldNotReachHere(); 2942 } 2943 } 2944 2945 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2, 2946 int cond, BasicType bt, uint vector_length, VectorMask vm) { 2947 assert(is_floating_point_type(bt), "unsupported element type"); 2948 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers"); 2949 vsetvli_helper(bt, vector_length); 2950 vmclr_m(vd); 2951 switch (cond) { 2952 case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break; 2953 case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break; 2954 case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break; 2955 case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break; 2956 case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break; 2957 case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break; 2958 default: 2959 assert(false, "unsupported compare condition"); 2960 ShouldNotReachHere(); 2961 } 2962 } 2963 2964 // In Matcher::scalable_predicate_reg_slots, 2965 // we assume each predicate register is one-eighth of the size of 2966 // scalable vector register, one mask bit per vector byte. 
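// For example (illustrative only): with MaxVectorSize == 32 bytes (VLEN == 256) each spilled
// mask occupies MaxVectorSize >> 3 == 4 bytes at sp + offset.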
2967 void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) { 2968 vsetvli_helper(T_BYTE, MaxVectorSize >> 3); 2969 add(t0, sp, offset); 2970 vse8_v(v, t0); 2971 } 2972 2973 void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) { 2974 vsetvli_helper(T_BYTE, MaxVectorSize >> 3); 2975 add(t0, sp, offset); 2976 vle8_v(v, t0); 2977 } 2978 2979 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length, 2980 VectorRegister src, BasicType src_bt, bool is_signed) { 2981 assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size"); 2982 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type"); 2983 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands 2984 // The destination EEW is greater than the source EEW, the source EMUL is at least 1, 2985 // and the overlap is in the highest-numbered part of the destination register group. 2986 // Since LMUL=1, vd and vs cannot be the same. 2987 assert_different_registers(dst, src); 2988 2989 vsetvli_helper(dst_bt, vector_length); 2990 if (is_signed) { 2991 if (src_bt == T_BYTE) { 2992 switch (dst_bt) { 2993 case T_SHORT: 2994 vsext_vf2(dst, src); 2995 break; 2996 case T_INT: 2997 vsext_vf4(dst, src); 2998 break; 2999 case T_LONG: 3000 vsext_vf8(dst, src); 3001 break; 3002 default: 3003 ShouldNotReachHere(); 3004 } 3005 } else if (src_bt == T_SHORT) { 3006 if (dst_bt == T_INT) { 3007 vsext_vf2(dst, src); 3008 } else { 3009 vsext_vf4(dst, src); 3010 } 3011 } else if (src_bt == T_INT) { 3012 vsext_vf2(dst, src); 3013 } 3014 } else { 3015 if (src_bt == T_BYTE) { 3016 switch (dst_bt) { 3017 case T_SHORT: 3018 vzext_vf2(dst, src); 3019 break; 3020 case T_INT: 3021 vzext_vf4(dst, src); 3022 break; 3023 case T_LONG: 3024 vzext_vf8(dst, src); 3025 break; 3026 default: 3027 ShouldNotReachHere(); 3028 } 3029 } else if (src_bt == T_SHORT) { 3030 if (dst_bt == T_INT) { 3031 vzext_vf2(dst, src); 3032 } else { 3033 vzext_vf4(dst, src); 3034 } 3035 } else if (src_bt == T_INT) { 3036 vzext_vf2(dst, src); 3037 } 3038 } 3039 } 3040 3041 // Vector narrow from src to dst with specified element sizes. 3042 // High part of dst vector will be filled with zero. 3043 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length, 3044 VectorRegister src, BasicType src_bt) { 3045 assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size"); 3046 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type"); 3047 mv(t0, vector_length); 3048 if (src_bt == T_LONG) { 3049 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions 3050 // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source. 3051 // So we can currently only scale down by 1/2 the width at a time. 
3052 vsetvli(t0, t0, Assembler::e32, Assembler::mf2); 3053 vncvt_x_x_w(dst, src); 3054 if (dst_bt == T_SHORT || dst_bt == T_BYTE) { 3055 vsetvli(t0, t0, Assembler::e16, Assembler::mf2); 3056 vncvt_x_x_w(dst, dst); 3057 if (dst_bt == T_BYTE) { 3058 vsetvli(t0, t0, Assembler::e8, Assembler::mf2); 3059 vncvt_x_x_w(dst, dst); 3060 } 3061 } 3062 } else if (src_bt == T_INT) { 3063 // T_SHORT 3064 vsetvli(t0, t0, Assembler::e16, Assembler::mf2); 3065 vncvt_x_x_w(dst, src); 3066 if (dst_bt == T_BYTE) { 3067 vsetvli(t0, t0, Assembler::e8, Assembler::mf2); 3068 vncvt_x_x_w(dst, dst); 3069 } 3070 } else if (src_bt == T_SHORT) { 3071 vsetvli(t0, t0, Assembler::e8, Assembler::mf2); 3072 vncvt_x_x_w(dst, src); 3073 } 3074 } 3075 3076 #define VFCVT_SAFE(VFLOATCVT) \ 3077 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \ 3078 assert_different_registers(dst, src); \ 3079 vxor_vv(dst, dst, dst); \ 3080 vmfeq_vv(v0, src, src); \ 3081 VFLOATCVT(dst, src, Assembler::v0_t); \ 3082 } 3083 3084 VFCVT_SAFE(vfcvt_rtz_x_f_v); 3085 3086 #undef VFCVT_SAFE 3087 3088 // Extract a scalar element from an vector at position 'idx'. 3089 // The input elements in src are expected to be of integral type. 3090 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt, 3091 int idx, VectorRegister tmp) { 3092 assert(is_integral_type(bt), "unsupported element type"); 3093 assert(idx >= 0, "idx cannot be negative"); 3094 // Only need the first element after vector slidedown 3095 vsetvli_helper(bt, 1); 3096 if (idx == 0) { 3097 vmv_x_s(dst, src); 3098 } else if (idx <= 31) { 3099 vslidedown_vi(tmp, src, idx); 3100 vmv_x_s(dst, tmp); 3101 } else { 3102 mv(t0, idx); 3103 vslidedown_vx(tmp, src, t0); 3104 vmv_x_s(dst, tmp); 3105 } 3106 } 3107 3108 // Extract a scalar element from an vector at position 'idx'. 3109 // The input elements in src are expected to be of floating point type. 3110 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt, 3111 int idx, VectorRegister tmp) { 3112 assert(is_floating_point_type(bt), "unsupported element type"); 3113 assert(idx >= 0, "idx cannot be negative"); 3114 // Only need the first element after vector slidedown 3115 vsetvli_helper(bt, 1); 3116 if (idx == 0) { 3117 vfmv_f_s(dst, src); 3118 } else if (idx <= 31) { 3119 vslidedown_vi(tmp, src, idx); 3120 vfmv_f_s(dst, tmp); 3121 } else { 3122 mv(t0, idx); 3123 vslidedown_vx(tmp, src, t0); 3124 vfmv_f_s(dst, tmp); 3125 } 3126 } --- EOF ---