1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/assembler.hpp" 28 #include "asm/assembler.inline.hpp" 29 #include "opto/c2_MacroAssembler.hpp" 30 #include "opto/compile.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 #include "utilities/globalDefinitions.hpp" 36 37 #ifdef PRODUCT 38 #define BLOCK_COMMENT(str) /* nothing */ 39 #define STOP(error) stop(error) 40 #else 41 #define BLOCK_COMMENT(str) block_comment(str) 42 #define STOP(error) block_comment(error); stop(error) 43 #endif 44 45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 46 47 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, 48 Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) { 49 // Use cr register to indicate the fast_lock result: zero for success; non-zero for failure. 50 Register flag = t1; 51 Register oop = objectReg; 52 Register box = boxReg; 53 Register disp_hdr = tmp1Reg; 54 Register tmp = tmp2Reg; 55 Label object_has_monitor; 56 // Finish fast lock successfully. MUST branch to with flag == 0 57 Label locked; 58 // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0 59 Label slow_path; 60 61 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 62 assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0); 63 64 mv(flag, 1); 65 66 // Load markWord from object into displaced_header. 67 ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 68 69 if (DiagnoseSyncOnValueBasedClasses != 0) { 70 load_klass(tmp, oop); 71 lbu(tmp, Address(tmp, Klass::misc_flags_offset())); 72 test_bit(tmp, tmp, exact_log2(KlassFlags::_misc_is_value_based_class)); 73 bnez(tmp, slow_path); 74 } 75 76 // Check for existing monitor 77 test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value)); 78 bnez(tmp, object_has_monitor); 79 80 if (LockingMode == LM_MONITOR) { 81 j(slow_path); 82 } else { 83 assert(LockingMode == LM_LEGACY, "must be"); 84 // Set tmp to be (markWord of object | UNLOCK_VALUE). 85 ori(tmp, disp_hdr, markWord::unlocked_value); 86 87 // Initialize the box. (Must happen before we update the object mark!) 
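    // (Illustrative note: the "box" is the BasicLock slot in the caller's frame; the
    //  unlocked mark stored there is the "displaced header" that fast_unlock later
    //  tries to cmpxchg back into the object.)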
    sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64,
            Assembler::aq, Assembler::rl, /*result*/disp_hdr);
    beq(disp_hdr, tmp, locked);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at the label locked.
    // Otherwise we did not see an unlocked object, so try the fast recursive case.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    sub(disp_hdr, disp_hdr, sp);
    mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
    // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto label locked,
    // hence we can store 0 as the displaced header in the box, which indicates that it is a
    // recursive lock.
    andr(tmp/*==0?*/, disp_hdr, tmp);
    sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    beqz(tmp, locked);
    j(slow_path);
  }

  // Handle existing monitor.
  bind(object_has_monitor);
  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
  cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64,
          Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected)

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
  mv(tmp, (address)markWord::unused_mark().value());
  sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  beqz(tmp3Reg, locked); // CAS success means locking succeeded

  bne(tmp3Reg, xthread, slow_path); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg);

  bind(locked);
  mv(flag, zr);
  increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2Reg, tmp3Reg);

#ifdef ASSERT
  // Check that locked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Lock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Lock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
                                    Register tmp1Reg, Register tmp2Reg) {
  // Use cr register to indicate the fast_unlock result: zero for success; non-zero for failure.
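  // Rough outline of the unlock protocol implemented below (illustrative pseudocode
  // only, not the emitted instruction sequence; names refer to the code that follows):
  //
  //   if (LM_LEGACY && box->displaced_header == 0) goto unlocked;    // recursive
  //   if (mark has monitor_value set)              goto object_has_monitor;
  //   if (CAS(&obj->mark, box, displaced_header) == box) goto unlocked;
  //   goto slow_path;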
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register owner_addr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label object_has_monitor;
  // Finish fast unlock successfully. MUST branch to with flag == 0
  Label unlocked;
  // Finish fast unlock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);

  mv(flag, 1);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    beqz(disp_hdr, unlocked);
  }

  // Handle existing monitor.
  ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  test_bit(t0, tmp, exact_log2(markWord::monitor_value));
  bnez(t0, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    j(slow_path);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a light weight lock, this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64,
            Assembler::relaxed, Assembler::rl, /*result*/tmp);
    beq(box, tmp, unlocked); // box == tmp if cas succeeds
    j(slow_path);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.

  // Recursive lock
  addi(disp_hdr, disp_hdr, -1);
  sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  j(unlocked);

  bind(notRecursive);
  // Compute owner address.
  la(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
  sd(zr, Address(owner_addr));
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty (EntryList first - by convention).
  ld(t0, Address(tmp, ObjectMonitor::EntryList_offset()));
  ld(tmp1Reg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(t0, t0, tmp1Reg);
  beqz(t0, unlocked); // If so we are done.

  // Check if there is a successor.
  ld(t0, Address(tmp, ObjectMonitor::succ_offset()));
  bnez(t0, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  sd(tmp, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));

  mv(flag, 1);
  j(slow_path);

  bind(unlocked);
  mv(flag, zr);
  decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp1Reg, tmp2Reg);

#ifdef ASSERT
  // Check that unlocked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box,
                                              Register tmp1, Register tmp2, Register tmp3) {
  // Flag register, zero for success; non-zero for failure.
  Register flag = t1;

  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0);

  mv(flag, 1);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == 0
  Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    sd(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp1, obj);
    lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
    test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
    bnez(tmp1, slow_path);
  }

  const Register tmp1_mark = tmp1;
  const Register tmp3_t = tmp3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
    Label push;

    const Register tmp2_top = tmp2;

    // Check if lock-stack is full.
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    mv(tmp3_t, (unsigned)LockStack::end_offset());
    bge(tmp2_top, tmp3_t, slow_path);

    // Check if recursive.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
    xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
    bne(tmp1_mark, tmp3_t, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    add(tmp3_t, xthread, tmp2_top);
    sd(obj, Address(tmp3_t));
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(locked);
  }

  { // Handle inflated monitor.
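    // Illustrative outline of the monitor lookup below when UseObjectMonitorTable is
    // enabled (comment-only sketch; field names as used by the code that follows):
    //
    //   for (slot in thread->om_cache) {       // two unrolled probes, then a loop
    //     if (slot.oop == obj)     goto monitor_found;
    //     if (slot.oop == nullptr) goto slow_path;   // null sentinel terminates cache
    //   }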
349 bind(inflated); 350 351 const Register tmp1_monitor = tmp1; 352 if (!UseObjectMonitorTable) { 353 assert(tmp1_monitor == tmp1_mark, "should be the same here"); 354 } else { 355 Label monitor_found; 356 357 // Load cache address 358 la(tmp3_t, Address(xthread, JavaThread::om_cache_oops_offset())); 359 360 const int num_unrolled = 2; 361 for (int i = 0; i < num_unrolled; i++) { 362 ld(tmp1, Address(tmp3_t)); 363 beq(obj, tmp1, monitor_found); 364 add(tmp3_t, tmp3_t, in_bytes(OMCache::oop_to_oop_difference())); 365 } 366 367 Label loop; 368 369 // Search for obj in cache. 370 bind(loop); 371 372 // Check for match. 373 ld(tmp1, Address(tmp3_t)); 374 beq(obj, tmp1, monitor_found); 375 376 // Search until null encountered, guaranteed _null_sentinel at end. 377 add(tmp3_t, tmp3_t, in_bytes(OMCache::oop_to_oop_difference())); 378 bnez(tmp1, loop); 379 // Cache Miss. Take the slowpath. 380 j(slow_path); 381 382 bind(monitor_found); 383 ld(tmp1_monitor, Address(tmp3_t, OMCache::oop_to_monitor_difference())); 384 } 385 386 const Register tmp2_owner_addr = tmp2; 387 const Register tmp3_owner = tmp3; 388 389 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 390 const Address owner_address(tmp1_monitor, ObjectMonitor::owner_offset() - monitor_tag); 391 const Address recursions_address(tmp1_monitor, ObjectMonitor::recursions_offset() - monitor_tag); 392 393 Label monitor_locked; 394 395 // Compute owner address. 396 la(tmp2_owner_addr, owner_address); 397 398 // CAS owner (null => current thread). 399 cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ xthread, Assembler::int64, 400 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner); 401 beqz(tmp3_owner, monitor_locked); 402 403 // Check if recursive. 404 bne(tmp3_owner, xthread, slow_path); 405 406 // Recursive. 407 increment(recursions_address, 1, tmp2, tmp3); 408 409 bind(monitor_locked); 410 if (UseObjectMonitorTable) { 411 sd(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 412 } 413 } 414 415 bind(locked); 416 mv(flag, zr); 417 increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3); 418 419 #ifdef ASSERT 420 // Check that locked label is reached with flag == 0. 421 Label flag_correct; 422 beqz(flag, flag_correct); 423 stop("Fast Lock Flag != 0"); 424 #endif 425 426 bind(slow_path); 427 #ifdef ASSERT 428 // Check that slow_path label is reached with flag != 0. 429 bnez(flag, flag_correct); 430 stop("Fast Lock Flag == 0"); 431 bind(flag_correct); 432 #endif 433 // C2 uses the value of flag (0 vs !0) to determine the continuation. 434 } 435 436 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, 437 Register tmp1, Register tmp2, Register tmp3) { 438 // Flag register, zero for success; non-zero for failure. 439 Register flag = t1; 440 441 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 442 assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0); 443 444 mv(flag, 1); 445 446 // Handle inflated monitor. 447 Label inflated, inflated_load_mark; 448 // Finish fast unlock successfully. unlocked MUST branch to with flag == 0 449 Label unlocked; 450 // Finish fast unlock unsuccessfully. MUST branch to with flag != 0 451 Label slow_path; 452 453 const Register tmp1_mark = tmp1; 454 const Register tmp2_top = tmp2; 455 const Register tmp3_t = tmp3; 456 457 { // Lightweight unlock 458 Label push_and_slow_path; 459 460 // Check if obj is top of lock-stack. 
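    // (The lock-stack top is kept as a byte offset from the thread base, so the most
    //  recently pushed entry is read from xthread + top - oopSize below.)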
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    subw(tmp2_top, tmp2_top, oopSize);
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t));
    // Top of lock stack was not obj. Must be monitor.
    bne(obj, tmp3_t, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
    DEBUG_ONLY(sd(zr, Address(tmp3_t));)
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, unlocked);

    // Not recursive.
    // Load Mark.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
    beq(tmp1_mark, tmp3_t, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
    DEBUG_ONLY(sd(obj, Address(tmp3_t));)
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(tmp2_top, tmp2_top, oopSize);
    mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
    blt(tmp2_top, tmp3_t, check_done);
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t));
    bne(obj, tmp3_t, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register tmp1_monitor = tmp1;

    if (!UseObjectMonitorTable) {
      assert(tmp1_monitor == tmp1_mark, "should be the same here");
      // Untag the monitor.
      add(tmp1_monitor, tmp1_mark, -(int)markWord::monitor_value);
    } else {
      ld(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // No valid pointer below alignof(ObjectMonitor*). Take the slow path.
      mv(tmp3_t, alignof(ObjectMonitor*));
      bltu(tmp1_monitor, tmp3_t, slow_path);
    }

    const Register tmp2_recursions = tmp2;
    Label not_recursive;

    // Check if recursive.
    ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
    beqz(tmp2_recursions, not_recursive);

    // Recursive unlock.
    addi(tmp2_recursions, tmp2_recursions, -1);
    sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
    j(unlocked);

    bind(not_recursive);

    const Register tmp2_owner_addr = tmp2;

    // Compute owner address.
    la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
    sd(zr, Address(tmp2_owner_addr));
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty (EntryList first - by convention).
    ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset()));
    ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset()));
    orr(t0, t0, tmp3_t);
    beqz(t0, unlocked); // If so we are done.

    // Check if there is a successor.
    ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::succ_offset()));
    bnez(tmp3_t, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try
    // to reacquire the lock in SharedRuntime::monitor_exit_helper().
    sd(tmp1_monitor, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));

    mv(flag, 1);
    j(slow_path);
  }

  bind(unlocked);
  mv(flag, zr);
  decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);

#ifdef ASSERT
  // Check that unlocked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

// short string
// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
                                                  Register ch, Register result,
                                                  bool isL)
{
  Register ch1 = t0;
  Register index = t1;

  BLOCK_COMMENT("string_indexof_char_short {");

  Label LOOP, LOOP1, LOOP4, LOOP8;
  Label MATCH, MATCH1, MATCH2, MATCH3,
        MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;

  mv(result, -1);
  mv(index, zr);

  bind(LOOP);
  addi(t0, index, 8);
  ble(t0, cnt1, LOOP8);
  addi(t0, index, 4);
  ble(t0, cnt1, LOOP4);
  j(LOOP1);

  bind(LOOP8);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
  beq(ch, ch1, MATCH4);
  isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
  beq(ch, ch1, MATCH5);
  isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
  beq(ch, ch1, MATCH6);
  isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
  beq(ch, ch1, MATCH7);
  addi(index, index, 8);
  addi(str1, str1, isL ? 8 : 16);
  blt(index, cnt1, LOOP);
  j(NOMATCH);

  bind(LOOP4);
  isL ?
lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0)); 658 beq(ch, ch1, MATCH); 659 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2)); 660 beq(ch, ch1, MATCH1); 661 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4)); 662 beq(ch, ch1, MATCH2); 663 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6)); 664 beq(ch, ch1, MATCH3); 665 addi(index, index, 4); 666 addi(str1, str1, isL ? 4 : 8); 667 bge(index, cnt1, NOMATCH); 668 669 bind(LOOP1); 670 isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1)); 671 beq(ch, ch1, MATCH); 672 addi(index, index, 1); 673 addi(str1, str1, isL ? 1 : 2); 674 blt(index, cnt1, LOOP1); 675 j(NOMATCH); 676 677 bind(MATCH1); 678 addi(index, index, 1); 679 j(MATCH); 680 681 bind(MATCH2); 682 addi(index, index, 2); 683 j(MATCH); 684 685 bind(MATCH3); 686 addi(index, index, 3); 687 j(MATCH); 688 689 bind(MATCH4); 690 addi(index, index, 4); 691 j(MATCH); 692 693 bind(MATCH5); 694 addi(index, index, 5); 695 j(MATCH); 696 697 bind(MATCH6); 698 addi(index, index, 6); 699 j(MATCH); 700 701 bind(MATCH7); 702 addi(index, index, 7); 703 704 bind(MATCH); 705 mv(result, index); 706 bind(NOMATCH); 707 BLOCK_COMMENT("} string_indexof_char_short"); 708 } 709 710 // StringUTF16.indexOfChar 711 // StringLatin1.indexOfChar 712 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 713 Register ch, Register result, 714 Register tmp1, Register tmp2, 715 Register tmp3, Register tmp4, 716 bool isL) 717 { 718 Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG; 719 Register ch1 = t0; 720 Register orig_cnt = t1; 721 Register mask1 = tmp3; 722 Register mask2 = tmp2; 723 Register match_mask = tmp1; 724 Register trailing_char = tmp4; 725 Register unaligned_elems = tmp4; 726 727 BLOCK_COMMENT("string_indexof_char {"); 728 beqz(cnt1, NOMATCH); 729 730 addi(t0, cnt1, isL ? -32 : -16); 731 bgtz(t0, DO_LONG); 732 string_indexof_char_short(str1, cnt1, ch, result, isL); 733 j(DONE); 734 735 bind(DO_LONG); 736 mv(orig_cnt, cnt1); 737 if (AvoidUnalignedAccesses) { 738 Label ALIGNED; 739 andi(unaligned_elems, str1, 0x7); 740 beqz(unaligned_elems, ALIGNED); 741 sub(unaligned_elems, unaligned_elems, 8); 742 neg(unaligned_elems, unaligned_elems); 743 if (!isL) { 744 srli(unaligned_elems, unaligned_elems, 1); 745 } 746 // do unaligned part per element 747 string_indexof_char_short(str1, unaligned_elems, ch, result, isL); 748 bgez(result, DONE); 749 mv(orig_cnt, cnt1); 750 sub(cnt1, cnt1, unaligned_elems); 751 bind(ALIGNED); 752 } 753 754 // duplicate ch 755 if (isL) { 756 slli(ch1, ch, 8); 757 orr(ch, ch1, ch); 758 } 759 slli(ch1, ch, 16); 760 orr(ch, ch1, ch); 761 slli(ch1, ch, 32); 762 orr(ch, ch1, ch); 763 764 if (!isL) { 765 slli(cnt1, cnt1, 1); 766 } 767 768 uint64_t mask0101 = UCONST64(0x0101010101010101); 769 uint64_t mask0001 = UCONST64(0x0001000100010001); 770 mv(mask1, isL ? mask0101 : mask0001); 771 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 772 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 773 mv(mask2, isL ? 
mask7f7f : mask7fff);

  bind(CH1_LOOP);
  ld(ch1, Address(str1));
  addi(str1, str1, 8);
  addi(cnt1, cnt1, -8);
  compute_match_mask(ch1, ch, match_mask, mask1, mask2);
  bnez(match_mask, HIT);
  bgtz(cnt1, CH1_LOOP);
  j(NOMATCH);

  bind(HIT);
  ctzc_bit(trailing_char, match_mask, isL, ch1, result);
  srli(trailing_char, trailing_char, 3);
  addi(cnt1, cnt1, 8);
  ble(cnt1, trailing_char, NOMATCH);
  // match case
  if (!isL) {
    srli(cnt1, cnt1, 1);
    srli(trailing_char, trailing_char, 1);
  }

  sub(result, orig_cnt, cnt1);
  add(result, result, trailing_char);
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof_char");
}

typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);

// Search for needle in haystack and return index or -1
// x10: result
// x11: haystack
// x12: haystack_len
// x13: needle
// x14: needle_len
void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
                                       Register haystack_len, Register needle_len,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       Register result, int ae)
{
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;

  Register ch1 = t0;
  Register ch2 = t1;
  Register nlen_tmp = tmp1; // needle len tmp
  Register hlen_tmp = tmp2; // haystack len tmp
  Register result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                    (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_indexof {");

  // Note, inline_string_indexOf() generates checks:
  // if (pattern.count > src.count) return -1;
  // if (pattern.count == 0) return 0;

  // We have two strings, a source string in haystack, haystack_len and a pattern string
  // in needle, needle_len. Find the first occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  // needle_len >= 8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
  sub(result_tmp, haystack_len, needle_len);
  // needle_len < 8, use linear scan
  sub(t0, needle_len, 8);
  bltz(t0, LINEARSEARCH);
  // needle_len >= 256, use linear scan
  sub(t0, needle_len, 256);
  bgez(t0, LINEARSTUB);
  // needle_len >= haystack_len/4, use linear scan
  srli(t0, haystack_len, 2);
  bge(needle_len, t0, LINEARSTUB);

  // Boyer-Moore-Horspool introduction:
  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules.
The 'Bad Character' rule 874 // and the 'Good Suffix' rule. 875 // 876 // These rules are essentially heuristics for how far we can shift the 877 // pattern along the search string. 878 // 879 // The implementation here uses the 'Bad Character' rule only because of the 880 // complexity of initialisation for the 'Good Suffix' rule. 881 // 882 // This is also known as the Boyer-Moore-Horspool algorithm: 883 // 884 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 885 // 886 // #define ASIZE 256 887 // 888 // int bm(unsigned char *pattern, int m, unsigned char *src, int n) { 889 // int i, j; 890 // unsigned c; 891 // unsigned char bc[ASIZE]; 892 // 893 // /* Preprocessing */ 894 // for (i = 0; i < ASIZE; ++i) 895 // bc[i] = m; 896 // for (i = 0; i < m - 1; ) { 897 // c = pattern[i]; 898 // ++i; 899 // // c < 256 for Latin1 string, so, no need for branch 900 // #ifdef PATTERN_STRING_IS_LATIN1 901 // bc[c] = m - i; 902 // #else 903 // if (c < ASIZE) bc[c] = m - i; 904 // #endif 905 // } 906 // 907 // /* Searching */ 908 // j = 0; 909 // while (j <= n - m) { 910 // c = src[i+j]; 911 // if (pattern[m-1] == c) 912 // int k; 913 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k); 914 // if (k < 0) return j; 915 // // c < 256 for Latin1 string, so, no need for branch 916 // #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1 917 // // LL case: (c< 256) always true. Remove branch 918 // j += bc[pattern[j+m-1]]; 919 // #endif 920 // #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF 921 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 922 // if (c < ASIZE) 923 // j += bc[pattern[j+m-1]]; 924 // else 925 // j += 1 926 // #endif 927 // #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1 928 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 929 // if (c < ASIZE) 930 // j += bc[pattern[j+m-1]]; 931 // else 932 // j += m 933 // #endif 934 // } 935 // return -1; 936 // } 937 938 // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result 939 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 940 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 941 942 Register haystack_end = haystack_len; 943 Register skipch = tmp2; 944 945 // pattern length is >=8, so, we can read at least 1 register for cases when 946 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 947 // UL case. We'll re-read last character in inner pre-loop code to have 948 // single outer pre-loop load 949 const int firstStep = isLL ? 
7 : 3; 950 951 const int ASIZE = 256; 952 const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd) 953 954 sub(sp, sp, ASIZE); 955 956 // init BC offset table with default value: needle_len 957 slli(t0, needle_len, 8); 958 orr(t0, t0, needle_len); // [63...16][needle_len][needle_len] 959 slli(tmp1, t0, 16); 960 orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len] 961 slli(tmp1, t0, 32); 962 orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len] 963 964 mv(ch1, sp); // ch1 is t0 965 mv(tmp6, ASIZE / STORE_BYTES); // loop iterations 966 967 bind(BM_INIT_LOOP); 968 // for (i = 0; i < ASIZE; ++i) 969 // bc[i] = m; 970 for (int i = 0; i < 4; i++) { 971 sd(tmp5, Address(ch1, i * wordSize)); 972 } 973 add(ch1, ch1, 32); 974 sub(tmp6, tmp6, 4); 975 bgtz(tmp6, BM_INIT_LOOP); 976 977 sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern 978 Register orig_haystack = tmp5; 979 mv(orig_haystack, haystack); 980 // result_tmp = tmp4 981 shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift); 982 sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1 983 mv(tmp3, needle); 984 985 // for (i = 0; i < m - 1; ) { 986 // c = pattern[i]; 987 // ++i; 988 // // c < 256 for Latin1 string, so, no need for branch 989 // #ifdef PATTERN_STRING_IS_LATIN1 990 // bc[c] = m - i; 991 // #else 992 // if (c < ASIZE) bc[c] = m - i; 993 // #endif 994 // } 995 bind(BCLOOP); 996 (this->*needle_load_1chr)(ch1, Address(tmp3), noreg); 997 add(tmp3, tmp3, needle_chr_size); 998 if (!needle_isL) { 999 // ae == StrIntrinsicNode::UU 1000 mv(tmp6, ASIZE); 1001 bgeu(ch1, tmp6, BCSKIP); 1002 } 1003 add(tmp4, sp, ch1); 1004 sb(ch2, Address(tmp4)); // store skip offset to BC offset table 1005 1006 bind(BCSKIP); 1007 sub(ch2, ch2, 1); // for next pattern element, skip distance -1 1008 bgtz(ch2, BCLOOP); 1009 1010 // tmp6: pattern end, address after needle 1011 shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift); 1012 if (needle_isL == haystack_isL) { 1013 // load last 8 bytes (8LL/4UU symbols) 1014 ld(tmp6, Address(tmp6, -wordSize)); 1015 } else { 1016 // UL: from UTF-16(source) search Latin1(pattern) 1017 lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols) 1018 // convert Latin1 to UTF. 
eg: 0x0000abcd -> 0x0a0b0c0d 1019 // We'll have to wait until load completed, but it's still faster than per-character loads+checks 1020 srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a 1021 slli(ch2, tmp6, XLEN - 24); 1022 srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b 1023 slli(ch1, tmp6, XLEN - 16); 1024 srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c 1025 andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d 1026 slli(ch2, ch2, 16); 1027 orr(ch2, ch2, ch1); // 0x00000b0c 1028 slli(result, tmp3, 48); // use result as temp register 1029 orr(tmp6, tmp6, result); // 0x0a00000d 1030 slli(result, ch2, 16); 1031 orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d 1032 } 1033 1034 // i = m - 1; 1035 // skipch = j + i; 1036 // if (skipch == pattern[m - 1] 1037 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k); 1038 // else 1039 // move j with bad char offset table 1040 bind(BMLOOPSTR2); 1041 // compare pattern to source string backward 1042 shadd(result, nlen_tmp, haystack, result, haystack_chr_shift); 1043 (this->*haystack_load_1chr)(skipch, Address(result), noreg); 1044 sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8 1045 if (needle_isL == haystack_isL) { 1046 // re-init tmp3. It's for free because it's executed in parallel with 1047 // load above. Alternative is to initialize it before loop, but it'll 1048 // affect performance on in-order systems with 2 or more ld/st pipelines 1049 srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1] 1050 } 1051 if (!isLL) { // UU/UL case 1052 slli(ch2, nlen_tmp, 1); // offsets in bytes 1053 } 1054 bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char 1055 add(result, haystack, isLL ? nlen_tmp : ch2); 1056 // load 8 bytes from source string 1057 // if isLL is false then read granularity can be 2 1058 load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway 1059 mv(ch1, tmp6); 1060 if (isLL) { 1061 j(BMLOOPSTR1_AFTER_LOAD); 1062 } else { 1063 sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 1064 j(BMLOOPSTR1_CMP); 1065 } 1066 1067 bind(BMLOOPSTR1); 1068 shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift); 1069 (this->*needle_load_1chr)(ch1, Address(ch1), noreg); 1070 shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift); 1071 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 1072 1073 bind(BMLOOPSTR1_AFTER_LOAD); 1074 sub(nlen_tmp, nlen_tmp, 1); 1075 bltz(nlen_tmp, BMLOOPSTR1_LASTCMP); 1076 1077 bind(BMLOOPSTR1_CMP); 1078 beq(ch1, ch2, BMLOOPSTR1); 1079 1080 bind(BMSKIP); 1081 if (!isLL) { 1082 // if we've met UTF symbol while searching Latin1 pattern, then we can 1083 // skip needle_len symbols 1084 if (needle_isL != haystack_isL) { 1085 mv(result_tmp, needle_len); 1086 } else { 1087 mv(result_tmp, 1); 1088 } 1089 mv(t0, ASIZE); 1090 bgeu(skipch, t0, BMADV); 1091 } 1092 add(result_tmp, sp, skipch); 1093 lbu(result_tmp, Address(result_tmp)); // load skip offset 1094 1095 bind(BMADV); 1096 sub(nlen_tmp, needle_len, 1); 1097 // move haystack after bad char skip offset 1098 shadd(haystack, result_tmp, haystack, result, haystack_chr_shift); 1099 ble(haystack, haystack_end, BMLOOPSTR2); 1100 add(sp, sp, ASIZE); 1101 j(NOMATCH); 1102 1103 bind(BMLOOPSTR1_LASTCMP); 1104 bne(ch1, ch2, BMSKIP); 1105 1106 bind(BMMATCH); 1107 sub(result, haystack, orig_haystack); 1108 if (!haystack_isL) { 1109 srli(result, result, 1); 1110 } 1111 add(sp, sp, ASIZE); 1112 j(DONE); 1113 1114 bind(LINEARSTUB); 1115 sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm 1116 bltz(t0, LINEARSEARCH); 1117 mv(result, zr); 1118 RuntimeAddress stub = nullptr; 1119 if (isLL) { 1120 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll()); 1121 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 1122 } else if (needle_isL) { 1123 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul()); 1124 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 1125 } else { 1126 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu()); 1127 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 1128 } 1129 address call = reloc_call(stub); 1130 if (call == nullptr) { 1131 DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH)); 1132 ciEnv::current()->record_failure("CodeCache is full"); 1133 return; 1134 } 1135 j(DONE); 1136 1137 bind(NOMATCH); 1138 mv(result, -1); 1139 j(DONE); 1140 1141 bind(LINEARSEARCH); 1142 string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae); 1143 1144 bind(DONE); 1145 BLOCK_COMMENT("} string_indexof"); 1146 } 1147 1148 // string_indexof 1149 // result: x10 1150 // src: x11 1151 // src_count: x12 1152 // pattern: x13 1153 // pattern_count: x14 or 1/2/3/4 1154 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle, 1155 Register haystack_len, Register needle_len, 1156 Register tmp1, Register tmp2, 1157 Register tmp3, Register tmp4, 1158 int needle_con_cnt, Register result, int ae) 1159 { 1160 // Note: 1161 // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant 1162 // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1 1163 assert(needle_con_cnt <= 4, "Invalid needle constant count"); 1164 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 1165 1166 Register ch1 = t0; 1167 Register ch2 = t1; 1168 Register hlen_neg = haystack_len, nlen_neg = needle_len; 1169 Register nlen_tmp = tmp1, 
hlen_tmp = tmp2, result_tmp = tmp4; 1170 1171 bool isLL = ae == StrIntrinsicNode::LL; 1172 1173 bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 1174 bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 1175 int needle_chr_shift = needle_isL ? 0 : 1; 1176 int haystack_chr_shift = haystack_isL ? 0 : 1; 1177 int needle_chr_size = needle_isL ? 1 : 2; 1178 int haystack_chr_size = haystack_isL ? 1 : 2; 1179 1180 load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu : 1181 (load_chr_insn)&MacroAssembler::lhu; 1182 load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu : 1183 (load_chr_insn)&MacroAssembler::lhu; 1184 load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu; 1185 load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld; 1186 1187 Label DO1, DO2, DO3, MATCH, NOMATCH, DONE; 1188 1189 Register first = tmp3; 1190 1191 if (needle_con_cnt == -1) { 1192 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 1193 1194 sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2); 1195 bltz(t0, DOSHORT); 1196 1197 (this->*needle_load_1chr)(first, Address(needle), noreg); 1198 slli(t0, needle_len, needle_chr_shift); 1199 add(needle, needle, t0); 1200 neg(nlen_neg, t0); 1201 slli(t0, result_tmp, haystack_chr_shift); 1202 add(haystack, haystack, t0); 1203 neg(hlen_neg, t0); 1204 1205 bind(FIRST_LOOP); 1206 add(t0, haystack, hlen_neg); 1207 (this->*haystack_load_1chr)(ch2, Address(t0), noreg); 1208 beq(first, ch2, STR1_LOOP); 1209 1210 bind(STR2_NEXT); 1211 add(hlen_neg, hlen_neg, haystack_chr_size); 1212 blez(hlen_neg, FIRST_LOOP); 1213 j(NOMATCH); 1214 1215 bind(STR1_LOOP); 1216 add(nlen_tmp, nlen_neg, needle_chr_size); 1217 add(hlen_tmp, hlen_neg, haystack_chr_size); 1218 bgez(nlen_tmp, MATCH); 1219 1220 bind(STR1_NEXT); 1221 add(ch1, needle, nlen_tmp); 1222 (this->*needle_load_1chr)(ch1, Address(ch1), noreg); 1223 add(ch2, haystack, hlen_tmp); 1224 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 1225 bne(ch1, ch2, STR2_NEXT); 1226 add(nlen_tmp, nlen_tmp, needle_chr_size); 1227 add(hlen_tmp, hlen_tmp, haystack_chr_size); 1228 bltz(nlen_tmp, STR1_NEXT); 1229 j(MATCH); 1230 1231 bind(DOSHORT); 1232 if (needle_isL == haystack_isL) { 1233 sub(t0, needle_len, 2); 1234 bltz(t0, DO1); 1235 bgtz(t0, DO3); 1236 } 1237 } 1238 1239 if (needle_con_cnt == 4) { 1240 Label CH1_LOOP; 1241 (this->*load_4chr)(ch1, Address(needle), noreg); 1242 sub(result_tmp, haystack_len, 4); 1243 slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp 1244 add(haystack, haystack, tmp3); 1245 neg(hlen_neg, tmp3); 1246 if (AvoidUnalignedAccesses) { 1247 // preload first value, then we will read by 1 character per loop, instead of four 1248 // just shifting previous ch2 right by size of character in bits 1249 add(tmp3, haystack, hlen_neg); 1250 (this->*load_4chr)(ch2, Address(tmp3), noreg); 1251 if (isLL) { 1252 // need to erase 1 most significant byte in 32-bit value of ch2 1253 slli(ch2, ch2, 40); 1254 srli(ch2, ch2, 32); 1255 } else { 1256 slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation 1257 } 1258 } 1259 1260 bind(CH1_LOOP); 1261 add(tmp3, haystack, hlen_neg); 1262 if (AvoidUnalignedAccesses) { 1263 srli(ch2, ch2, isLL ? 8 : 16); 1264 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg); 1265 slli(tmp3, tmp3, isLL ? 
24 : 48); 1266 add(ch2, ch2, tmp3); 1267 } else { 1268 (this->*load_4chr)(ch2, Address(tmp3), noreg); 1269 } 1270 beq(ch1, ch2, MATCH); 1271 add(hlen_neg, hlen_neg, haystack_chr_size); 1272 blez(hlen_neg, CH1_LOOP); 1273 j(NOMATCH); 1274 } 1275 1276 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) { 1277 Label CH1_LOOP; 1278 BLOCK_COMMENT("string_indexof DO2 {"); 1279 bind(DO2); 1280 (this->*load_2chr)(ch1, Address(needle), noreg); 1281 if (needle_con_cnt == 2) { 1282 sub(result_tmp, haystack_len, 2); 1283 } 1284 slli(tmp3, result_tmp, haystack_chr_shift); 1285 add(haystack, haystack, tmp3); 1286 neg(hlen_neg, tmp3); 1287 if (AvoidUnalignedAccesses) { 1288 // preload first value, then we will read by 1 character per loop, instead of two 1289 // just shifting previous ch2 right by size of character in bits 1290 add(tmp3, haystack, hlen_neg); 1291 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg); 1292 slli(ch2, ch2, isLL ? 8 : 16); 1293 } 1294 bind(CH1_LOOP); 1295 add(tmp3, haystack, hlen_neg); 1296 if (AvoidUnalignedAccesses) { 1297 srli(ch2, ch2, isLL ? 8 : 16); 1298 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg); 1299 slli(tmp3, tmp3, isLL ? 8 : 16); 1300 add(ch2, ch2, tmp3); 1301 } else { 1302 (this->*load_2chr)(ch2, Address(tmp3), noreg); 1303 } 1304 beq(ch1, ch2, MATCH); 1305 add(hlen_neg, hlen_neg, haystack_chr_size); 1306 blez(hlen_neg, CH1_LOOP); 1307 j(NOMATCH); 1308 BLOCK_COMMENT("} string_indexof DO2"); 1309 } 1310 1311 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) { 1312 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 1313 BLOCK_COMMENT("string_indexof DO3 {"); 1314 1315 bind(DO3); 1316 (this->*load_2chr)(first, Address(needle), noreg); 1317 (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg); 1318 if (needle_con_cnt == 3) { 1319 sub(result_tmp, haystack_len, 3); 1320 } 1321 slli(hlen_tmp, result_tmp, haystack_chr_shift); 1322 add(haystack, haystack, hlen_tmp); 1323 neg(hlen_neg, hlen_tmp); 1324 1325 bind(FIRST_LOOP); 1326 add(ch2, haystack, hlen_neg); 1327 if (AvoidUnalignedAccesses) { 1328 (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2 1329 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 1330 slli(tmp2, tmp2, isLL ? 
8 : 16); 1331 add(ch2, ch2, tmp2); 1332 } else { 1333 (this->*load_2chr)(ch2, Address(ch2), noreg); 1334 } 1335 beq(first, ch2, STR1_LOOP); 1336 1337 bind(STR2_NEXT); 1338 add(hlen_neg, hlen_neg, haystack_chr_size); 1339 blez(hlen_neg, FIRST_LOOP); 1340 j(NOMATCH); 1341 1342 bind(STR1_LOOP); 1343 add(hlen_tmp, hlen_neg, 2 * haystack_chr_size); 1344 add(ch2, haystack, hlen_tmp); 1345 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 1346 bne(ch1, ch2, STR2_NEXT); 1347 j(MATCH); 1348 BLOCK_COMMENT("} string_indexof DO3"); 1349 } 1350 1351 if (needle_con_cnt == -1 || needle_con_cnt == 1) { 1352 Label DO1_LOOP; 1353 1354 BLOCK_COMMENT("string_indexof DO1 {"); 1355 bind(DO1); 1356 (this->*needle_load_1chr)(ch1, Address(needle), noreg); 1357 sub(result_tmp, haystack_len, 1); 1358 slli(tmp3, result_tmp, haystack_chr_shift); 1359 add(haystack, haystack, tmp3); 1360 neg(hlen_neg, tmp3); 1361 1362 bind(DO1_LOOP); 1363 add(tmp3, haystack, hlen_neg); 1364 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg); 1365 beq(ch1, ch2, MATCH); 1366 add(hlen_neg, hlen_neg, haystack_chr_size); 1367 blez(hlen_neg, DO1_LOOP); 1368 BLOCK_COMMENT("} string_indexof DO1"); 1369 } 1370 1371 bind(NOMATCH); 1372 mv(result, -1); 1373 j(DONE); 1374 1375 bind(MATCH); 1376 srai(t0, hlen_neg, haystack_chr_shift); 1377 add(result, result_tmp, t0); 1378 1379 bind(DONE); 1380 } 1381 1382 // Compare strings. 1383 void C2_MacroAssembler::string_compare(Register str1, Register str2, 1384 Register cnt1, Register cnt2, Register result, 1385 Register tmp1, Register tmp2, Register tmp3, 1386 int ae) 1387 { 1388 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 1389 DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 1390 SHORT_LOOP_START, TAIL_CHECK, L; 1391 1392 const int STUB_THRESHOLD = 64 + 8; 1393 bool isLL = ae == StrIntrinsicNode::LL; 1394 bool isLU = ae == StrIntrinsicNode::LU; 1395 bool isUL = ae == StrIntrinsicNode::UL; 1396 1397 bool str1_isL = isLL || isLU; 1398 bool str2_isL = isLL || isUL; 1399 1400 // for L strings, 1 byte for 1 character 1401 // for U strings, 2 bytes for 1 character 1402 int str1_chr_size = str1_isL ? 1 : 2; 1403 int str2_chr_size = str2_isL ? 1 : 2; 1404 int minCharsInWord = isLL ? wordSize : wordSize / 2; 1405 1406 load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu; 1407 load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu; 1408 1409 BLOCK_COMMENT("string_compare {"); 1410 1411 // Bizarrely, the counts are passed in bytes, regardless of whether they 1412 // are L or U strings, however the result is always in characters. 1413 if (!str1_isL) { 1414 sraiw(cnt1, cnt1, 1); 1415 } 1416 if (!str2_isL) { 1417 sraiw(cnt2, cnt2, 1); 1418 } 1419 1420 // Compute the minimum of the string lengths and save the difference in result. 
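  // Worked example (character counts, illustrative values only): cnt1 = 3, cnt2 = 7
  // gives result = -4 and the loop bound cnt2 becomes 3; if the first 3 characters
  // match, the comparison falls through and that length difference is returned.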
1421 sub(result, cnt1, cnt2); 1422 bgt(cnt1, cnt2, L); 1423 mv(cnt2, cnt1); 1424 bind(L); 1425 1426 // A very short string 1427 mv(t0, minCharsInWord); 1428 ble(cnt2, t0, SHORT_STRING); 1429 1430 // Compare longwords 1431 // load first parts of strings and finish initialization while loading 1432 { 1433 if (str1_isL == str2_isL) { // LL or UU 1434 // check if str1 and str2 is same pointer 1435 beq(str1, str2, DONE); 1436 // load 8 bytes once to compare 1437 ld(tmp1, Address(str1)); 1438 ld(tmp2, Address(str2)); 1439 mv(t0, STUB_THRESHOLD); 1440 bge(cnt2, t0, STUB); 1441 sub(cnt2, cnt2, minCharsInWord); 1442 beqz(cnt2, TAIL_CHECK); 1443 // convert cnt2 from characters to bytes 1444 if (!str1_isL) { 1445 slli(cnt2, cnt2, 1); 1446 } 1447 add(str2, str2, cnt2); 1448 add(str1, str1, cnt2); 1449 sub(cnt2, zr, cnt2); 1450 } else if (isLU) { // LU case 1451 lwu(tmp1, Address(str1)); 1452 ld(tmp2, Address(str2)); 1453 mv(t0, STUB_THRESHOLD); 1454 bge(cnt2, t0, STUB); 1455 addi(cnt2, cnt2, -4); 1456 add(str1, str1, cnt2); 1457 sub(cnt1, zr, cnt2); 1458 slli(cnt2, cnt2, 1); 1459 add(str2, str2, cnt2); 1460 inflate_lo32(tmp3, tmp1); 1461 mv(tmp1, tmp3); 1462 sub(cnt2, zr, cnt2); 1463 addi(cnt1, cnt1, 4); 1464 } else { // UL case 1465 ld(tmp1, Address(str1)); 1466 lwu(tmp2, Address(str2)); 1467 mv(t0, STUB_THRESHOLD); 1468 bge(cnt2, t0, STUB); 1469 addi(cnt2, cnt2, -4); 1470 slli(t0, cnt2, 1); 1471 sub(cnt1, zr, t0); 1472 add(str1, str1, t0); 1473 add(str2, str2, cnt2); 1474 inflate_lo32(tmp3, tmp2); 1475 mv(tmp2, tmp3); 1476 sub(cnt2, zr, cnt2); 1477 addi(cnt1, cnt1, 8); 1478 } 1479 addi(cnt2, cnt2, isUL ? 4 : 8); 1480 bne(tmp1, tmp2, DIFFERENCE); 1481 bgez(cnt2, TAIL); 1482 1483 // main loop 1484 bind(NEXT_WORD); 1485 if (str1_isL == str2_isL) { // LL or UU 1486 add(t0, str1, cnt2); 1487 ld(tmp1, Address(t0)); 1488 add(t0, str2, cnt2); 1489 ld(tmp2, Address(t0)); 1490 addi(cnt2, cnt2, 8); 1491 } else if (isLU) { // LU case 1492 add(t0, str1, cnt1); 1493 lwu(tmp1, Address(t0)); 1494 add(t0, str2, cnt2); 1495 ld(tmp2, Address(t0)); 1496 addi(cnt1, cnt1, 4); 1497 inflate_lo32(tmp3, tmp1); 1498 mv(tmp1, tmp3); 1499 addi(cnt2, cnt2, 8); 1500 } else { // UL case 1501 add(t0, str2, cnt2); 1502 lwu(tmp2, Address(t0)); 1503 add(t0, str1, cnt1); 1504 ld(tmp1, Address(t0)); 1505 inflate_lo32(tmp3, tmp2); 1506 mv(tmp2, tmp3); 1507 addi(cnt1, cnt1, 8); 1508 addi(cnt2, cnt2, 4); 1509 } 1510 bne(tmp1, tmp2, DIFFERENCE); 1511 bltz(cnt2, NEXT_WORD); 1512 bind(TAIL); 1513 if (str1_isL == str2_isL) { // LL or UU 1514 load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2); 1515 load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2); 1516 } else if (isLU) { // LU case 1517 load_int_misaligned(tmp1, Address(str1), tmp3, false); 1518 load_long_misaligned(tmp2, Address(str2), tmp3, 2); 1519 inflate_lo32(tmp3, tmp1); 1520 mv(tmp1, tmp3); 1521 } else { // UL case 1522 load_int_misaligned(tmp2, Address(str2), tmp3, false); 1523 load_long_misaligned(tmp1, Address(str1), tmp3, 2); 1524 inflate_lo32(tmp3, tmp2); 1525 mv(tmp2, tmp3); 1526 } 1527 bind(TAIL_CHECK); 1528 beq(tmp1, tmp2, DONE); 1529 1530 // Find the first different characters in the longwords and 1531 // compute their difference. 
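  // Worked example (LL, illustrative values only): if tmp1 and tmp2 differ only in
  // their lowest byte, say 'a' (0x61) vs 'A' (0x41), the xor's first differing
  // character is char 0, both words are shifted right by 0, masked with 0xFF, and
  // result = 0x61 - 0x41 = 32.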
1532 bind(DIFFERENCE); 1533 xorr(tmp3, tmp1, tmp2); 1534 ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb 1535 srl(tmp1, tmp1, result); 1536 srl(tmp2, tmp2, result); 1537 if (isLL) { 1538 andi(tmp1, tmp1, 0xFF); 1539 andi(tmp2, tmp2, 0xFF); 1540 } else { 1541 andi(tmp1, tmp1, 0xFFFF); 1542 andi(tmp2, tmp2, 0xFFFF); 1543 } 1544 sub(result, tmp1, tmp2); 1545 j(DONE); 1546 } 1547 1548 bind(STUB); 1549 RuntimeAddress stub = nullptr; 1550 switch (ae) { 1551 case StrIntrinsicNode::LL: 1552 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL()); 1553 break; 1554 case StrIntrinsicNode::UU: 1555 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU()); 1556 break; 1557 case StrIntrinsicNode::LU: 1558 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU()); 1559 break; 1560 case StrIntrinsicNode::UL: 1561 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL()); 1562 break; 1563 default: 1564 ShouldNotReachHere(); 1565 } 1566 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1567 address call = reloc_call(stub); 1568 if (call == nullptr) { 1569 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1570 ciEnv::current()->record_failure("CodeCache is full"); 1571 return; 1572 } 1573 j(DONE); 1574 1575 bind(SHORT_STRING); 1576 // Is the minimum length zero? 1577 beqz(cnt2, DONE); 1578 // arrange code to do most branches while loading and loading next characters 1579 // while comparing previous 1580 (this->*str1_load_chr)(tmp1, Address(str1), t0); 1581 addi(str1, str1, str1_chr_size); 1582 addi(cnt2, cnt2, -1); 1583 beqz(cnt2, SHORT_LAST_INIT); 1584 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1585 addi(str2, str2, str2_chr_size); 1586 j(SHORT_LOOP_START); 1587 bind(SHORT_LOOP); 1588 addi(cnt2, cnt2, -1); 1589 beqz(cnt2, SHORT_LAST); 1590 bind(SHORT_LOOP_START); 1591 (this->*str1_load_chr)(tmp2, Address(str1), t0); 1592 addi(str1, str1, str1_chr_size); 1593 (this->*str2_load_chr)(t0, Address(str2), t0); 1594 addi(str2, str2, str2_chr_size); 1595 bne(tmp1, cnt1, SHORT_LOOP_TAIL); 1596 addi(cnt2, cnt2, -1); 1597 beqz(cnt2, SHORT_LAST2); 1598 (this->*str1_load_chr)(tmp1, Address(str1), t0); 1599 addi(str1, str1, str1_chr_size); 1600 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1601 addi(str2, str2, str2_chr_size); 1602 beq(tmp2, t0, SHORT_LOOP); 1603 sub(result, tmp2, t0); 1604 j(DONE); 1605 bind(SHORT_LOOP_TAIL); 1606 sub(result, tmp1, cnt1); 1607 j(DONE); 1608 bind(SHORT_LAST2); 1609 beq(tmp2, t0, DONE); 1610 sub(result, tmp2, t0); 1611 1612 j(DONE); 1613 bind(SHORT_LAST_INIT); 1614 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1615 addi(str2, str2, str2_chr_size); 1616 bind(SHORT_LAST); 1617 beq(tmp1, cnt1, DONE); 1618 sub(result, tmp1, cnt1); 1619 1620 bind(DONE); 1621 1622 BLOCK_COMMENT("} string_compare"); 1623 } 1624 1625 void C2_MacroAssembler::arrays_equals(Register a1, Register a2, 1626 Register tmp1, Register tmp2, Register tmp3, 1627 Register result, int elem_size) { 1628 assert(elem_size == 1 || elem_size == 2, "must be char or byte"); 1629 assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0); 1630 1631 int elem_per_word = wordSize/elem_size; 1632 int log_elem_size = exact_log2(elem_size); 1633 int length_offset = arrayOopDesc::length_offset_in_bytes(); 1634 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? 
T_CHAR : T_BYTE); 1635 1636 Register cnt1 = tmp3; 1637 Register cnt2 = tmp1; // cnt2 only used in array length compare 1638 Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01; 1639 1640 BLOCK_COMMENT("arrays_equals {"); 1641 1642 // if (a1 == a2), return true 1643 beq(a1, a2, SAME); 1644 1645 mv(result, false); 1646 // if (a1 == nullptr || a2 == nullptr) 1647 // return false; 1648 beqz(a1, DONE); 1649 beqz(a2, DONE); 1650 1651 // if (a1.length != a2.length) 1652 // return false; 1653 lwu(cnt1, Address(a1, length_offset)); 1654 lwu(cnt2, Address(a2, length_offset)); 1655 bne(cnt1, cnt2, DONE); 1656 1657 la(a1, Address(a1, base_offset)); 1658 la(a2, Address(a2, base_offset)); 1659 // Check for short strings, i.e. smaller than wordSize. 1660 addi(cnt1, cnt1, -elem_per_word); 1661 bltz(cnt1, SHORT); 1662 1663 // Main 8 byte comparison loop. 1664 bind(NEXT_WORD); { 1665 ld(tmp1, Address(a1)); 1666 ld(tmp2, Address(a2)); 1667 addi(cnt1, cnt1, -elem_per_word); 1668 addi(a1, a1, wordSize); 1669 addi(a2, a2, wordSize); 1670 bne(tmp1, tmp2, DONE); 1671 } bgez(cnt1, NEXT_WORD); 1672 1673 addi(tmp1, cnt1, elem_per_word); 1674 beqz(tmp1, SAME); 1675 1676 bind(SHORT); 1677 test_bit(tmp1, cnt1, 2 - log_elem_size); 1678 beqz(tmp1, TAIL03); // 0-7 bytes left. 1679 { 1680 lwu(tmp1, Address(a1)); 1681 lwu(tmp2, Address(a2)); 1682 addi(a1, a1, 4); 1683 addi(a2, a2, 4); 1684 bne(tmp1, tmp2, DONE); 1685 } 1686 1687 bind(TAIL03); 1688 test_bit(tmp1, cnt1, 1 - log_elem_size); 1689 beqz(tmp1, TAIL01); // 0-3 bytes left. 1690 { 1691 lhu(tmp1, Address(a1)); 1692 lhu(tmp2, Address(a2)); 1693 addi(a1, a1, 2); 1694 addi(a2, a2, 2); 1695 bne(tmp1, tmp2, DONE); 1696 } 1697 1698 bind(TAIL01); 1699 if (elem_size == 1) { // Only needed when comparing byte arrays. 1700 test_bit(tmp1, cnt1, 0); 1701 beqz(tmp1, SAME); // 0-1 bytes left. 1702 { 1703 lbu(tmp1, Address(a1)); 1704 lbu(tmp2, Address(a2)); 1705 bne(tmp1, tmp2, DONE); 1706 } 1707 } 1708 1709 bind(SAME); 1710 mv(result, true); 1711 // That's it. 1712 bind(DONE); 1713 1714 BLOCK_COMMENT("} arrays_equals"); 1715 } 1716 1717 // Compare Strings 1718 1719 // For Strings we're passed the address of the first characters in a1 and a2 1720 // and the length in cnt1. There are two implementations. 1721 // For arrays >= 8 bytes, all comparisons (except for the tail) are performed 1722 // 8 bytes at a time. For the tail, we compare a halfword, then a short, and then a byte. 1723 // For strings < 8 bytes, we compare a halfword, then a short, and then a byte. 1724 1725 void C2_MacroAssembler::string_equals(Register a1, Register a2, 1726 Register result, Register cnt1) 1727 { 1728 Label SAME, DONE, SHORT, NEXT_WORD; 1729 Register tmp1 = t0; 1730 Register tmp2 = t1; 1731 1732 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2); 1733 1734 BLOCK_COMMENT("string_equals {"); 1735 1736 mv(result, false); 1737 1738 // Check for short strings, i.e. smaller than wordSize. 1739 addi(cnt1, cnt1, -wordSize); 1740 bltz(cnt1, SHORT); 1741 1742 // Main 8 byte comparison loop. 1743 bind(NEXT_WORD); { 1744 ld(tmp1, Address(a1)); 1745 ld(tmp2, Address(a2)); 1746 addi(cnt1, cnt1, -wordSize); 1747 addi(a1, a1, wordSize); 1748 addi(a2, a2, wordSize); 1749 bne(tmp1, tmp2, DONE); 1750 } bgez(cnt1, NEXT_WORD); 1751 1752 addi(tmp1, cnt1, wordSize); 1753 beqz(tmp1, SAME); 1754 1755 bind(SHORT); 1756 Label TAIL03, TAIL01; 1757 1758 // 0-7 bytes left. 
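  // Worked example (illustrative): with 5 bytes remaining (low bits 0b101 of the
  // length), bit 2 is set so a 4-byte word is compared; bit 1 is clear so the
  // halfword step is skipped; bit 0 is set so the final byte is compared.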
1759 test_bit(tmp1, cnt1, 2); 1760 beqz(tmp1, TAIL03); 1761 { 1762 lwu(tmp1, Address(a1)); 1763 lwu(tmp2, Address(a2)); 1764 addi(a1, a1, 4); 1765 addi(a2, a2, 4); 1766 bne(tmp1, tmp2, DONE); 1767 } 1768 1769 bind(TAIL03); 1770 // 0-3 bytes left. 1771 test_bit(tmp1, cnt1, 1); 1772 beqz(tmp1, TAIL01); 1773 { 1774 lhu(tmp1, Address(a1)); 1775 lhu(tmp2, Address(a2)); 1776 addi(a1, a1, 2); 1777 addi(a2, a2, 2); 1778 bne(tmp1, tmp2, DONE); 1779 } 1780 1781 bind(TAIL01); 1782 // 0-1 bytes left. 1783 test_bit(tmp1, cnt1, 0); 1784 beqz(tmp1, SAME); 1785 { 1786 lbu(tmp1, Address(a1)); 1787 lbu(tmp2, Address(a2)); 1788 bne(tmp1, tmp2, DONE); 1789 } 1790 1791 // Arrays are equal. 1792 bind(SAME); 1793 mv(result, true); 1794 1795 // That's it. 1796 bind(DONE); 1797 BLOCK_COMMENT("} string_equals"); 1798 } 1799 1800 // jdk.internal.util.ArraysSupport.vectorizedHashCode 1801 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 1802 Register tmp1, Register tmp2, Register tmp3, 1803 Register tmp4, Register tmp5, Register tmp6, 1804 BasicType eltype) 1805 { 1806 assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1); 1807 1808 const int elsize = arrays_hashcode_elsize(eltype); 1809 const int chunks_end_shift = exact_log2(elsize); 1810 1811 switch (eltype) { 1812 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 1813 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 1814 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 1815 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 1816 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 1817 default: 1818 ShouldNotReachHere(); 1819 } 1820 1821 const int stride = 4; 1822 const Register pow31_4 = tmp1; 1823 const Register pow31_3 = tmp2; 1824 const Register pow31_2 = tmp3; 1825 const Register chunks = tmp4; 1826 const Register chunks_end = chunks; 1827 1828 Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP; 1829 1830 // result has a value initially 1831 1832 beqz(cnt, DONE); 1833 1834 andi(chunks, cnt, ~(stride-1)); 1835 beqz(chunks, TAIL); 1836 1837 mv(pow31_4, 923521); // [31^^4] 1838 mv(pow31_3, 29791); // [31^^3] 1839 mv(pow31_2, 961); // [31^^2] 1840 1841 slli(chunks_end, chunks, chunks_end_shift); 1842 add(chunks_end, ary, chunks_end); 1843 andi(cnt, cnt, stride-1); // don't forget about tail! 
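// A scalar sketch of what one iteration of the unrolled loop below computes
// (for illustration only; 923521 == 31^4, 29791 == 31^3, 961 == 31^2):
//
//   result = 923521 * result
//          +  29791 * ary[i + 0]
//          +    961 * ary[i + 1]
//          +     31 * ary[i + 2]
//          +          ary[i + 3];
//
// i.e. the usual "result = 31 * result + ary[i]" recurrence applied four times.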
1844 1845 bind(WIDE_LOOP); 1846 mulw(result, result, pow31_4); // 31^^4 * h 1847 arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype); 1848 arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype); 1849 arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype); 1850 arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype); 1851 mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0] 1852 addw(result, result, t0); 1853 mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1] 1854 addw(result, result, t1); 1855 slli(t0, tmp5, 5); // optimize 31^^1 * ary[i+2] 1856 subw(tmp5, t0, tmp5); // with ary[i+2]<<5 - ary[i+2] 1857 addw(result, result, tmp5); 1858 addw(result, result, tmp6); // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1] 1859 // + 31^^1 * ary[i+2] + 31^^0 * ary[i+3] 1860 addi(ary, ary, elsize * stride); 1861 bne(ary, chunks_end, WIDE_LOOP); 1862 beqz(cnt, DONE); 1863 1864 bind(TAIL); 1865 slli(chunks_end, cnt, chunks_end_shift); 1866 add(chunks_end, ary, chunks_end); 1867 1868 bind(TAIL_LOOP); 1869 arrays_hashcode_elload(t0, Address(ary), eltype); 1870 slli(t1, result, 5); // optimize 31 * result 1871 subw(result, t1, result); // with result<<5 - result 1872 addw(result, result, t0); 1873 addi(ary, ary, elsize); 1874 bne(ary, chunks_end, TAIL_LOOP); 1875 1876 bind(DONE); 1877 BLOCK_COMMENT("} // arrays_hashcode"); 1878 } 1879 1880 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 1881 switch (eltype) { 1882 case T_BOOLEAN: return sizeof(jboolean); 1883 case T_BYTE: return sizeof(jbyte); 1884 case T_SHORT: return sizeof(jshort); 1885 case T_CHAR: return sizeof(jchar); 1886 case T_INT: return sizeof(jint); 1887 default: 1888 ShouldNotReachHere(); 1889 return -1; 1890 } 1891 } 1892 1893 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 1894 switch (eltype) { 1895 // T_BOOLEAN used as surrogate for unsigned byte 1896 case T_BOOLEAN: lbu(dst, src); break; 1897 case T_BYTE: lb(dst, src); break; 1898 case T_SHORT: lh(dst, src); break; 1899 case T_CHAR: lhu(dst, src); break; 1900 case T_INT: lw(dst, src); break; 1901 default: 1902 ShouldNotReachHere(); 1903 } 1904 } 1905 1906 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far); 1907 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label, 1908 bool is_far, bool is_unordered); 1909 1910 static conditional_branch_insn conditional_branches[] = 1911 { 1912 /* SHORT branches */ 1913 (conditional_branch_insn)&MacroAssembler::beq, 1914 (conditional_branch_insn)&MacroAssembler::bgt, 1915 nullptr, // BoolTest::overflow 1916 (conditional_branch_insn)&MacroAssembler::blt, 1917 (conditional_branch_insn)&MacroAssembler::bne, 1918 (conditional_branch_insn)&MacroAssembler::ble, 1919 nullptr, // BoolTest::no_overflow 1920 (conditional_branch_insn)&MacroAssembler::bge, 1921 1922 /* UNSIGNED branches */ 1923 (conditional_branch_insn)&MacroAssembler::beq, 1924 (conditional_branch_insn)&MacroAssembler::bgtu, 1925 nullptr, 1926 (conditional_branch_insn)&MacroAssembler::bltu, 1927 (conditional_branch_insn)&MacroAssembler::bne, 1928 (conditional_branch_insn)&MacroAssembler::bleu, 1929 nullptr, 1930 (conditional_branch_insn)&MacroAssembler::bgeu 1931 }; 1932 1933 static float_conditional_branch_insn float_conditional_branches[] = 1934 { 1935 /* FLOAT SHORT branches */ 1936 (float_conditional_branch_insn)&MacroAssembler::float_beq, 1937 (float_conditional_branch_insn)&MacroAssembler::float_bgt, 1938 
nullptr, // BoolTest::overflow
1939 (float_conditional_branch_insn)&MacroAssembler::float_blt,
1940 (float_conditional_branch_insn)&MacroAssembler::float_bne,
1941 (float_conditional_branch_insn)&MacroAssembler::float_ble,
1942 nullptr, // BoolTest::no_overflow
1943 (float_conditional_branch_insn)&MacroAssembler::float_bge,
1944
1945 /* DOUBLE SHORT branches */
1946 (float_conditional_branch_insn)&MacroAssembler::double_beq,
1947 (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1948 nullptr,
1949 (float_conditional_branch_insn)&MacroAssembler::double_blt,
1950 (float_conditional_branch_insn)&MacroAssembler::double_bne,
1951 (float_conditional_branch_insn)&MacroAssembler::double_ble,
1952 nullptr,
1953 (float_conditional_branch_insn)&MacroAssembler::double_bge
1954 };
1955
1956 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1957 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1958 "invalid conditional branch index");
1959 (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1960 }
1961
1962 // This function should only be used by C2. For greater/greater-equal conditions the unordered case is
1963 // flipped: C2 uses unordered-lesser instead of unordered-greater, and commutes the result bits in do_one_bytecode().
1964 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1965 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1966 "invalid float conditional branch index");
1967 int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1968 (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1969 (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1970 }
1971
1972 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1973 switch (cmpFlag) {
1974 case BoolTest::eq:
1975 case BoolTest::le:
1976 beqz(op1, L, is_far);
1977 break;
1978 case BoolTest::ne:
1979 case BoolTest::gt:
1980 bnez(op1, L, is_far);
1981 break;
1982 default:
1983 ShouldNotReachHere();
1984 }
1985 }
1986
1987 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1988 switch (cmpFlag) {
1989 case BoolTest::eq:
1990 beqz(op1, L, is_far);
1991 break;
1992 case BoolTest::ne:
1993 bnez(op1, L, is_far);
1994 break;
1995 default:
1996 ShouldNotReachHere();
1997 }
1998 }
1999
2000 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
2001 Label L;
2002 cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
2003 mv(dst, src);
2004 bind(L);
2005 }
2006
2007 // Set dst to NaN if any NaN input.
2008 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
2009 bool is_double, bool is_min) {
2010 assert_different_registers(dst, src1, src2);
2011
2012 Label Done, Compare;
2013
2014 is_double ? fclass_d(t0, src1)
2015 : fclass_s(t0, src1);
2016 is_double ? fclass_d(t1, src2)
2017 : fclass_s(t1, src2);
2018 orr(t0, t0, t1);
2019 andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2020 beqz(t0, Compare);
2021 is_double ? fadd_d(dst, src1, src2)
2022 : fadd_s(dst, src1, src2);
2023 j(Done);
2024
2025 bind(Compare);
2026 if (is_double) {
2027 is_min ?
fmin_d(dst, src1, src2) 2028 : fmax_d(dst, src1, src2); 2029 } else { 2030 is_min ? fmin_s(dst, src1, src2) 2031 : fmax_s(dst, src1, src2); 2032 } 2033 2034 bind(Done); 2035 } 2036 2037 // According to Java SE specification, for floating-point round operations, if 2038 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the 2039 // rounded result; this differs from behavior of RISC-V fcvt instructions (which 2040 // round out-of-range values to the nearest max or min value), therefore special 2041 // handling is needed by NaN, +/-Infinity, +/-0. 2042 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode, 2043 Register tmp1, Register tmp2, Register tmp3) { 2044 2045 assert_different_registers(dst, src); 2046 assert_different_registers(tmp1, tmp2, tmp3); 2047 2048 // Set rounding mode for conversions 2049 // Here we use similar modes to double->long and long->double conversions 2050 // Different mode for long->double conversion matter only if long value was not representable as double, 2051 // we got long value as a result of double->long conversion so, it is definitely representable 2052 RoundingMode rm; 2053 switch (round_mode) { 2054 case RoundDoubleModeNode::rmode_ceil: 2055 rm = RoundingMode::rup; 2056 break; 2057 case RoundDoubleModeNode::rmode_floor: 2058 rm = RoundingMode::rdn; 2059 break; 2060 case RoundDoubleModeNode::rmode_rint: 2061 rm = RoundingMode::rne; 2062 break; 2063 default: 2064 ShouldNotReachHere(); 2065 } 2066 2067 // tmp1 - is a register to store double converted to long int 2068 // tmp2 - is a register to create constant for comparison 2069 // tmp3 - is a register where we store modified result of double->long conversion 2070 Label done, bad_val; 2071 2072 // Conversion from double to long 2073 fcvt_l_d(tmp1, src, rm); 2074 2075 // Generate constant (tmp2) 2076 // tmp2 = 100...0000 2077 addi(tmp2, zr, 1); 2078 slli(tmp2, tmp2, 63); 2079 2080 // Prepare converted long (tmp1) 2081 // as a result when conversion overflow we got: 2082 // tmp1 = 011...1111 or 100...0000 2083 // Convert it to: tmp3 = 100...0000 2084 addi(tmp3, tmp1, 1); 2085 andi(tmp3, tmp3, -2); 2086 beq(tmp3, tmp2, bad_val); 2087 2088 // Conversion from long to double 2089 fcvt_d_l(dst, tmp1, rm); 2090 // Add sign of input value to result for +/- 0 cases 2091 fsgnj_d(dst, dst, src); 2092 j(done); 2093 2094 // If got conversion overflow return src 2095 bind(bad_val); 2096 fmv_d(dst, src); 2097 2098 bind(done); 2099 } 2100 2101 // According to Java SE specification, for floating-point signum operations, if 2102 // on input we have NaN or +/-0.0 value we should return it, 2103 // otherwise return +/- 1.0 using sign of input. 2104 // one - gives us a floating-point 1.0 (got from matching rule) 2105 // bool is_double - specifies single or double precision operations will be used. 2106 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) { 2107 Label done; 2108 2109 is_double ? fclass_d(t0, dst) 2110 : fclass_s(t0, dst); 2111 2112 // check if input is -0, +0, signaling NaN or quiet NaN 2113 andi(t0, t0, fclass_mask::zero | fclass_mask::nan); 2114 2115 bnez(t0, done); 2116 2117 // use floating-point 1.0 with a sign of input 2118 is_double ? fsgnj_d(dst, one, dst) 2119 : fsgnj_s(dst, one, dst); 2120 2121 bind(done); 2122 } 2123 2124 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) { 2125 #define __ masm. 
2126 FloatRegister dst = stub.data<0>();
2127 Register src = stub.data<1>();
2128 Register tmp = stub.data<2>();
2129 __ bind(stub.entry());
2130
2131 // The following instructions mainly focus on NaN, as riscv does not handle
2132 // NaN well with fcvt, but the code also works for Inf at the same time.
2133
2134 // Construct a 32-bit NaN from the 16-bit NaN;
2135 // the payloads of non-canonical NaNs need to be preserved.
2136 __ mv(tmp, 0x7f800000);
2137 // sign-bit was already set via sign-extension if necessary.
2138 __ slli(t0, src, 13);
2139 __ orr(tmp, t0, tmp);
2140 __ fmv_w_x(dst, tmp);
2141
2142 __ j(stub.continuation());
2143 #undef __
2144 }
2145
2146 // j.l.Float.float16ToFloat
2147 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
2148 auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
2149
2150 // On riscv, NaN needs special handling as fcvt does not work in that case.
2151 // Inf, on the other hand, is handled correctly by fcvt and would not need the
2152 // slow path, but we let the slow path process NaN and Inf together:
2153 // both are rare, and having the slow path handle only the NaN case would
2154 // sacrifice performance for the normal cases,
2155 // i.e. the non-NaN and non-Inf cases.
2156
2157 // check whether it's a NaN or +/- Inf.
2158 mv(t0, 0x7c00);
2159 andr(tmp, src, t0);
2160 // jump to stub processing NaN and Inf cases.
2161 beq(t0, tmp, stub->entry());
2162
2163 // non-NaN and non-Inf cases, just use built-in instructions.
2164 fmv_h_x(dst, src);
2165 fcvt_s_h(dst, dst);
2166
2167 bind(stub->continuation());
2168 }
2169
2170 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
2171 #define __ masm.
2172 Register dst = stub.data<0>();
2173 FloatRegister src = stub.data<1>();
2174 Register tmp = stub.data<2>();
2175 __ bind(stub.entry());
2176
2177 __ fmv_x_w(dst, src);
2178
2179 // preserve the payloads of non-canonical NaNs.
2180 __ srai(dst, dst, 13);
2181 // preserve the sign bit.
2182 __ srai(tmp, dst, 13);
2183 __ slli(tmp, tmp, 10);
2184 __ mv(t0, 0x3ff);
2185 __ orr(tmp, tmp, t0);
2186
2187 // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2188 __ andr(dst, dst, tmp);
2189
2190 __ j(stub.continuation());
2191 #undef __
2192 }
2193
2194 // j.l.Float.floatToFloat16
2195 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
2196 auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);
2197
2198 // On riscv, NaN needs special handling as fcvt does not work in that case.
2199
2200 // check whether it's a NaN.
2201 // replace fclass with feq as a performance optimization.
2202 feq_s(t0, src, src);
2203 // jump to stub processing NaN cases.
2204 beqz(t0, stub->entry());
2205
2206 // non-NaN cases, just use built-in instructions.
2207 fcvt_h_s(ftmp, src);
2208 fmv_x_h(dst, ftmp);
2209
2210 bind(stub->continuation());
2211 }
2212
2213 static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
2214 #define __ masm.
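// Worked example of the widening done in this slow path (illustrative values
// only): the half-precision quiet NaN 0x7e00, shifted left by 13, gives
// 0x0fc00000; OR-ing in 0x7f800000 yields the single-precision quiet NaN
// 0x7fc00000, with any non-canonical payload bits carried along unchanged.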
2215 VectorRegister dst = stub.data<0>();
2216 VectorRegister src = stub.data<1>();
2217 uint vector_length = stub.data<2>();
2218 __ bind(stub.entry());
2219
2220 // The following instructions mainly focus on NaN, as riscv does not handle
2221 // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
2222 //
2223 // Construct 32-bit NaNs from the 16-bit NaNs;
2224 // the payloads of non-canonical NaNs need to be preserved.
2225
2226 // adjust vector type to 2 * SEW.
2227 __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
2228 // widen and sign-extend src data.
2229 __ vsext_vf2(dst, src, Assembler::v0_t);
2230 __ mv(t0, 0x7f800000);
2231 // sign-bit was already set via sign-extension if necessary.
2232 __ vsll_vi(dst, dst, 13, Assembler::v0_t);
2233 __ vor_vx(dst, dst, t0, Assembler::v0_t);
2234
2235 __ j(stub.continuation());
2236 #undef __
2237 }
2238
2239 // j.l.Float.float16ToFloat
2240 void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
2241 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
2242 (dst, src, vector_length, 24, float16_to_float_v_slow_path);
2243 assert_different_registers(dst, src);
2244
2245 // On riscv, NaN needs special handling as vfwcvt_f_f_v does not work in that case.
2246 // Inf, on the other hand, is handled correctly by vfwcvt_f_f_v and would not need
2247 // the slow path, but we let the slow path process NaN and Inf together:
2248 // both are rare, and having the slow path handle only the NaN case would
2249 // sacrifice performance for the normal cases,
2250 // i.e. the non-NaN and non-Inf cases.
2251
2252 vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);
2253
2254 // check whether there is a NaN or +/- Inf.
2255 mv(t0, 0x7c00);
2256 vand_vx(v0, src, t0);
2257 // v0 will be used as a mask in the slow path.
2258 vmseq_vx(v0, v0, t0);
2259 vcpop_m(t0, v0);
2260
2261 // For non-NaN and non-Inf cases, just use built-in instructions.
2262 vfwcvt_f_f_v(dst, src);
2263
2264 // jump to the stub processing NaN and Inf cases if any lane in the vector contains one.
2265 bnez(t0, stub->entry());
2266
2267 bind(stub->continuation());
2268 }
2269
2270 static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
2271 C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
2272 #define __ masm.
2273 VectorRegister dst = stub.data<0>();
2274 VectorRegister src = stub.data<1>();
2275 VectorRegister tmp = stub.data<2>();
2276 __ bind(stub.entry());
2277
2278 // lmul is already set to mf2 in float_to_float16_v.
2279
2280 // preserve the payloads of non-canonical NaNs.
2281 __ vnsra_wi(dst, src, 13, Assembler::v0_t);
2282
2283 // preserve the sign bit.
2284 __ vnsra_wi(tmp, src, 26, Assembler::v0_t);
2285 __ vsll_vi(tmp, tmp, 10, Assembler::v0_t);
2286 __ mv(t0, 0x3ff);
2287 __ vor_vx(tmp, tmp, t0, Assembler::v0_t);
2288
2289 // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
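// Putting the pieces together, each lane becomes
// (f >> 13) & (((f >> 26) << 10) | 0x3ff), using arithmetic shifts and keeping
// the low 16 bits; e.g. the float quiet NaN 0x7fc00000 narrows to the half
// quiet NaN 0x7e00 (a worked example for illustration only).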
2290 __ vand_vv(dst, dst, tmp, Assembler::v0_t);
2291
2292 __ j(stub.continuation());
2293 #undef __
2294 }
2295
2296 // j.l.Float.floatToFloat16
2297 void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src, VectorRegister vtmp,
2298 Register tmp, uint vector_length) {
2299 assert_different_registers(dst, src, vtmp);
2300
2301 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2302 (dst, src, vtmp, 28, float_to_float16_v_slow_path);
2303
2304 // On riscv, NaN needs special handling as vfncvt_f_f_w does not work in that case.
2305
2306 vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);
2307
2308 // check whether there is a NaN.
2309 // replace vfclass with vmfne_vv as a performance optimization.
2310 vmfne_vv(v0, src, src);
2311 vcpop_m(t0, v0);
2312
2313 vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);
2314
2315 // For non-NaN cases, just use built-in instructions.
2316 vfncvt_f_f_w(dst, src);
2317
2318 // jump to stub processing NaN cases.
2319 bnez(t0, stub->entry());
2320
2321 bind(stub->continuation());
2322 }
2323
2324 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
2325 vsetvli_helper(bt, vlen);
2326
2327 // check if input is -0, +0, signaling NaN or quiet NaN
2328 vfclass_v(v0, dst);
2329 mv(t0, fclass_mask::zero | fclass_mask::nan);
2330 vand_vx(v0, v0, t0);
2331 vmseq_vi(v0, v0, 0);
2332
2333 // use floating-point 1.0 with the sign of the input
2334 vfsgnj_vv(dst, one, dst, v0_t);
2335 }
2336
2337 void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
2338 Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2339 // intrinsic is enabled when MaxVectorSize >= 16
2340 Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2341 long len = is_long ? 64 : 32;
2342
2343 // load the src data(in bits) to be compressed.
2344 vsetivli(x0, 1, sew, Assembler::m1);
2345 vmv_s_x(v0, src);
2346 // reset the src data(in bytes) to zero.
2347 mv(t0, len);
2348 vsetvli(x0, t0, Assembler::e8, lmul);
2349 vmv_v_i(v4, 0);
2350 // convert the src data from bits to bytes.
2351 vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
2352 // reset the dst data(in bytes) to zero.
2353 vmv_v_i(v8, 0);
2354 // load the mask data(in bits).
2355 vsetivli(x0, 1, sew, Assembler::m1);
2356 vmv_s_x(v0, mask);
2357 // compress the src data(in bytes) to dst(in bytes).
2358 vsetvli(x0, t0, Assembler::e8, lmul);
2359 vcompress_vm(v8, v4, v0);
2360 // convert the dst data from bytes to bits.
2361 vmseq_vi(v0, v8, 1);
2362 // store result back.
2363 vsetivli(x0, 1, sew, Assembler::m1);
2364 vmv_x_s(dst, v0);
2365 }
2366
2367 void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
2368 compress_bits_v(dst, src, mask, /* is_long */ false);
2369 }
2370
2371 void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
2372 compress_bits_v(dst, src, mask, /* is_long */ true);
2373 }
2374
2375 void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
2376 Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
2377 // intrinsic is enabled when MaxVectorSize >= 16
2378 Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
2379 long len = is_long ? 64 : 32;
2380
2381 // load the src data(in bits) to be expanded.
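// (For reference, the expansion below implements Long.expand semantics: the
// low-order bits of src are deposited, in order, into the bit positions where
// mask is set, e.g. expand(src = 0b0011, mask = 0b110101) == 0b000101.
// The example is for illustration only.)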
2382 vsetivli(x0, 1, sew, Assembler::m1);
2383 vmv_s_x(v0, src);
2384 // reset the src data(in bytes) to zero.
2385 mv(t0, len);
2386 vsetvli(x0, t0, Assembler::e8, lmul);
2387 vmv_v_i(v4, 0);
2388 // convert the src data from bits to bytes.
2389 vmerge_vim(v4, v4, 1); // v0 as implicit mask register
2390 // reset the dst data(in bytes) to zero.
2391 vmv_v_i(v12, 0);
2392 // load the mask data(in bits).
2393 vsetivli(x0, 1, sew, Assembler::m1);
2394 vmv_s_x(v0, mask);
2395 // expand the src data(in bytes) to dst(in bytes).
2396 vsetvli(x0, t0, Assembler::e8, lmul);
2397 viota_m(v8, v0);
2398 vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
2399 // convert the dst data from bytes to bits.
2400 vmseq_vi(v0, v12, 1);
2401 // store result back.
2402 vsetivli(x0, 1, sew, Assembler::m1);
2403 vmv_x_s(dst, v0);
2404 }
2405
2406 void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
2407 expand_bits_v(dst, src, mask, /* is_long */ false);
2408 }
2409
2410 void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
2411 expand_bits_v(dst, src, mask, /* is_long */ true);
2412 }
2413
2414 // j.l.Math.round(float)
2415 // Returns the closest int to the argument, with ties rounding to positive infinity.
2416 // We need to handle 3 special cases defined by the Java API spec:
2417 // NaN,
2418 // float >= Integer.MAX_VALUE,
2419 // float <= Integer.MIN_VALUE.
2420 void C2_MacroAssembler::java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2421 BasicType bt, uint vector_length) {
2422 // On riscv there is no rounding mode that directly matches the behaviour defined
2423 // by the Java API spec, i.e. every rounding mode mishandles some corner case, e.g.
2424 // RNE is the closest one, but it ties to "even", so 1.5 and 2.5 are both converted
2425 // to 2, instead of 2 and 3 respectively.
2426 // RUP does not work either: although the Java API requires "rounding to positive infinity"
2427 // for ties, RUP would convert both 1.3 and 1.8 to 2, instead of 1 and 2 respectively.
2428 //
2429 // The optimal solution for non-NaN cases is:
2430 // src + 0.5 => dst, with rdn rounding mode,
2431 // convert dst from float to int, with rdn rounding mode.
2432 // This solution also works as expected for float >= Integer.MAX_VALUE and float <= Integer.MIN_VALUE.
2433 //
2434 // But, we still need to handle NaN explicitly with vector mask instructions.
2435 //
2436 // Check MacroAssembler::java_round_float and C2_MacroAssembler::vector_round_sve in aarch64 for more details.
2437
2438 csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2439 vsetvli_helper(bt, vector_length);
2440
2441 // don't rearrange the instruction sequence without performance testing.
2442 // check MacroAssembler::java_round_float in riscv64 for more details.
2443 mv(t0, jint_cast(0.5f));
2444 fmv_w_x(ftmp, t0);
2445
2446 // replacing vfclass with vmfeq_vv as a performance optimization
2447 vmfeq_vv(v0, src, src);
2448 // set dst = 0 in cases of NaN
2449 vmv_v_x(dst, zr);
2450
2451 // dst = (src + 0.5) rounded down towards negative infinity
2452 vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2453 vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2454
2455 csrwi(CSR_FRM, C2_MacroAssembler::rne);
2456 }
2457
2458 // java.lang.Math.round(double a)
2459 // Returns the closest long to the argument, with ties rounding to positive infinity.
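// Worked examples (illustrative values only): round(2.3) == 2 because
// 2.3 + 0.5 == 2.8 rounds down to 2; round(2.5) == 3 because 3.0 rounds down
// to 3; round(-2.5) == -2 because -2.0 rounds down to -2. This is exactly what
// the rdn-mode add-then-convert sequence below produces.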
2460 void C2_MacroAssembler::java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp, 2461 BasicType bt, uint vector_length) { 2462 // check C2_MacroAssembler::java_round_float_v above for more details. 2463 2464 csrwi(CSR_FRM, C2_MacroAssembler::rdn); 2465 vsetvli_helper(bt, vector_length); 2466 2467 mv(t0, julong_cast(0.5)); 2468 fmv_d_x(ftmp, t0); 2469 2470 // replacing vfclass with feq as performance optimization 2471 vmfeq_vv(v0, src, src); 2472 // set dst = 0 in cases of NaN 2473 vmv_v_x(dst, zr); 2474 2475 // dst = (src + 0.5) rounded down towards negative infinity 2476 vfadd_vf(dst, src, ftmp, Assembler::v0_t); 2477 vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn 2478 2479 csrwi(CSR_FRM, C2_MacroAssembler::rne); 2480 } 2481 2482 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2, 2483 VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE, 2484 Assembler::LMUL lmul) { 2485 Label loop; 2486 Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16; 2487 2488 bind(loop); 2489 vsetvli(tmp1, cnt, sew, lmul); 2490 vlex_v(vr1, a1, sew); 2491 vlex_v(vr2, a2, sew); 2492 vmsne_vv(vrs, vr1, vr2); 2493 vfirst_m(tmp2, vrs); 2494 bgez(tmp2, DONE); 2495 sub(cnt, cnt, tmp1); 2496 if (!islatin) { 2497 slli(tmp1, tmp1, 1); // get byte counts 2498 } 2499 add(a1, a1, tmp1); 2500 add(a2, a2, tmp1); 2501 bnez(cnt, loop); 2502 2503 mv(result, true); 2504 } 2505 2506 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) { 2507 Label DONE; 2508 Register tmp1 = t0; 2509 Register tmp2 = t1; 2510 2511 BLOCK_COMMENT("string_equals_v {"); 2512 2513 mv(result, false); 2514 2515 element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE, Assembler::m2); 2516 2517 bind(DONE); 2518 BLOCK_COMMENT("} string_equals_v"); 2519 } 2520 2521 // used by C2 ClearArray patterns. 2522 // base: Address of a buffer to be zeroed 2523 // cnt: Count in HeapWords 2524 // 2525 // base, cnt, v4, v5, v6, v7 and t0 are clobbered. 2526 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) { 2527 Label loop; 2528 2529 // making zero words 2530 vsetvli(t0, cnt, Assembler::e64, Assembler::m4); 2531 vxor_vv(v4, v4, v4); 2532 2533 bind(loop); 2534 vsetvli(t0, cnt, Assembler::e64, Assembler::m4); 2535 vse64_v(v4, base); 2536 sub(cnt, cnt, t0); 2537 shadd(base, t0, base, t0, 3); 2538 bnez(cnt, loop); 2539 } 2540 2541 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result, 2542 Register cnt1, int elem_size) { 2543 Label DONE; 2544 Register tmp1 = t0; 2545 Register tmp2 = t1; 2546 Register cnt2 = tmp2; 2547 int length_offset = arrayOopDesc::length_offset_in_bytes(); 2548 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? 
T_CHAR : T_BYTE); 2549 2550 BLOCK_COMMENT("arrays_equals_v {"); 2551 2552 // if (a1 == a2), return true 2553 mv(result, true); 2554 beq(a1, a2, DONE); 2555 2556 mv(result, false); 2557 // if a1 == null or a2 == null, return false 2558 beqz(a1, DONE); 2559 beqz(a2, DONE); 2560 // if (a1.length != a2.length), return false 2561 lwu(cnt1, Address(a1, length_offset)); 2562 lwu(cnt2, Address(a2, length_offset)); 2563 bne(cnt1, cnt2, DONE); 2564 2565 la(a1, Address(a1, base_offset)); 2566 la(a2, Address(a2, base_offset)); 2567 2568 element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE, Assembler::m2); 2569 2570 bind(DONE); 2571 2572 BLOCK_COMMENT("} arrays_equals_v"); 2573 } 2574 2575 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2, 2576 Register result, Register tmp1, Register tmp2, int encForm) { 2577 Label DIFFERENCE, DONE, L, loop; 2578 bool encLL = encForm == StrIntrinsicNode::LL; 2579 bool encLU = encForm == StrIntrinsicNode::LU; 2580 bool encUL = encForm == StrIntrinsicNode::UL; 2581 2582 bool str1_isL = encLL || encLU; 2583 bool str2_isL = encLL || encUL; 2584 2585 int minCharsInWord = encLL ? wordSize : wordSize / 2; 2586 2587 BLOCK_COMMENT("string_compare {"); 2588 2589 // for Latin strings, 1 byte for 1 character 2590 // for UTF16 strings, 2 bytes for 1 character 2591 if (!str1_isL) 2592 sraiw(cnt1, cnt1, 1); 2593 if (!str2_isL) 2594 sraiw(cnt2, cnt2, 1); 2595 2596 // if str1 == str2, return the difference 2597 // save the minimum of the string lengths in cnt2. 2598 sub(result, cnt1, cnt2); 2599 bgt(cnt1, cnt2, L); 2600 mv(cnt2, cnt1); 2601 bind(L); 2602 2603 // We focus on the optimization of small sized string. 2604 // Please check below document for string size distribution statistics. 2605 // https://cr.openjdk.org/~shade/density/string-density-report.pdf 2606 if (str1_isL == str2_isL) { // LL or UU 2607 // Below construction of v regs and lmul is based on test on 2 different boards, 2608 // vlen == 128 and vlen == 256 respectively. 2609 if (!encLL && MaxVectorSize == 16) { // UU 2610 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v4, v8, v4, encLL, DIFFERENCE, Assembler::m4); 2611 } else { // UU + MaxVectorSize or LL 2612 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE, Assembler::m2); 2613 } 2614 2615 j(DONE); 2616 } else { // LU or UL 2617 Register strL = encLU ? str1 : str2; 2618 Register strU = encLU ? str2 : str1; 2619 VectorRegister vstr1 = encLU ? v8 : v4; 2620 VectorRegister vstr2 = encLU ? v4 : v8; 2621 2622 bind(loop); 2623 vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2); 2624 vle8_v(vstr1, strL); 2625 vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4); 2626 vzext_vf2(vstr2, vstr1); 2627 vle16_v(vstr1, strU); 2628 vmsne_vv(v4, vstr2, vstr1); 2629 vfirst_m(tmp2, v4); 2630 bgez(tmp2, DIFFERENCE); 2631 sub(cnt2, cnt2, tmp1); 2632 add(strL, strL, tmp1); 2633 shadd(strU, tmp1, strU, tmp1, 1); 2634 bnez(cnt2, loop); 2635 j(DONE); 2636 } 2637 2638 bind(DIFFERENCE); 2639 slli(tmp1, tmp2, 1); 2640 add(str1, str1, str1_isL ? tmp2 : tmp1); 2641 add(str2, str2, str2_isL ? tmp2 : tmp1); 2642 str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0)); 2643 str2_isL ? 
lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0)); 2644 sub(result, tmp1, tmp2); 2645 2646 bind(DONE); 2647 } 2648 2649 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) { 2650 Label loop; 2651 assert_different_registers(src, dst, len, tmp, t0); 2652 2653 BLOCK_COMMENT("byte_array_inflate_v {"); 2654 bind(loop); 2655 vsetvli(tmp, len, Assembler::e8, Assembler::m2); 2656 vle8_v(v6, src); 2657 vsetvli(t0, len, Assembler::e16, Assembler::m4); 2658 vzext_vf2(v4, v6); 2659 vse16_v(v4, dst); 2660 sub(len, len, tmp); 2661 add(src, src, tmp); 2662 shadd(dst, tmp, dst, tmp, 1); 2663 bnez(len, loop); 2664 BLOCK_COMMENT("} byte_array_inflate_v"); 2665 } 2666 2667 // Compress char[] array to byte[]. 2668 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) 2669 // result: the array length if every element in array can be encoded, 2670 // otherwise, the index of first non-latin1 (> 0xff) character. 2671 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len, 2672 Register result, Register tmp) { 2673 encode_iso_array_v(src, dst, len, result, tmp, false); 2674 } 2675 2676 // Intrinsic for 2677 // 2678 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray 2679 // return the number of characters copied. 2680 // - java/lang/StringUTF16.compress 2681 // return index of non-latin1 character if copy fails, otherwise 'len'. 2682 // 2683 // This version always returns the number of characters copied. A successful 2684 // copy will complete with the post-condition: 'res' == 'len', while an 2685 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'. 2686 // 2687 // Clobbers: src, dst, len, result, t0 2688 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len, 2689 Register result, Register tmp, bool ascii) { 2690 Label loop, fail, done; 2691 2692 BLOCK_COMMENT("encode_iso_array_v {"); 2693 mv(result, 0); 2694 2695 bind(loop); 2696 mv(tmp, ascii ? 
0x7f : 0xff); 2697 vsetvli(t0, len, Assembler::e16, Assembler::m2); 2698 vle16_v(v2, src); 2699 2700 vmsgtu_vx(v1, v2, tmp); 2701 vfirst_m(tmp, v1); 2702 vmsbf_m(v0, v1); 2703 // compress char to byte 2704 vsetvli(t0, len, Assembler::e8); 2705 vncvt_x_x_w(v1, v2, Assembler::v0_t); 2706 vse8_v(v1, dst, Assembler::v0_t); 2707 2708 // fail if char > 0x7f/0xff 2709 bgez(tmp, fail); 2710 add(result, result, t0); 2711 add(dst, dst, t0); 2712 sub(len, len, t0); 2713 shadd(src, t0, src, t0, 1); 2714 bnez(len, loop); 2715 j(done); 2716 2717 bind(fail); 2718 add(result, result, tmp); 2719 2720 bind(done); 2721 BLOCK_COMMENT("} encode_iso_array_v"); 2722 } 2723 2724 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) { 2725 Label LOOP, SET_RESULT, DONE; 2726 2727 BLOCK_COMMENT("count_positives_v {"); 2728 assert_different_registers(ary, len, result, tmp); 2729 2730 mv(result, zr); 2731 2732 bind(LOOP); 2733 vsetvli(t0, len, Assembler::e8, Assembler::m4); 2734 vle8_v(v4, ary); 2735 vmslt_vx(v4, v4, zr); 2736 vfirst_m(tmp, v4); 2737 bgez(tmp, SET_RESULT); 2738 // if tmp == -1, all bytes are positive 2739 add(result, result, t0); 2740 2741 sub(len, len, t0); 2742 add(ary, ary, t0); 2743 bnez(len, LOOP); 2744 j(DONE); 2745 2746 // add remaining positive bytes count 2747 bind(SET_RESULT); 2748 add(result, result, tmp); 2749 2750 bind(DONE); 2751 BLOCK_COMMENT("} count_positives_v"); 2752 } 2753 2754 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1, 2755 Register ch, Register result, 2756 Register tmp1, Register tmp2, 2757 bool isL) { 2758 mv(result, zr); 2759 2760 Label loop, MATCH, DONE; 2761 Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16; 2762 bind(loop); 2763 vsetvli(tmp1, cnt1, sew, Assembler::m4); 2764 vlex_v(v4, str1, sew); 2765 vmseq_vx(v4, v4, ch); 2766 vfirst_m(tmp2, v4); 2767 bgez(tmp2, MATCH); // if equal, return index 2768 2769 add(result, result, tmp1); 2770 sub(cnt1, cnt1, tmp1); 2771 if (!isL) slli(tmp1, tmp1, 1); 2772 add(str1, str1, tmp1); 2773 bnez(cnt1, loop); 2774 2775 mv(result, -1); 2776 j(DONE); 2777 2778 bind(MATCH); 2779 add(result, result, tmp2); 2780 2781 bind(DONE); 2782 } 2783 2784 // Set dst to NaN if any NaN input. 2785 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, 2786 BasicType bt, bool is_min, uint vector_length) { 2787 assert_different_registers(dst, src1, src2); 2788 2789 vsetvli_helper(bt, vector_length); 2790 2791 is_min ? vfmin_vv(dst, src1, src2) 2792 : vfmax_vv(dst, src1, src2); 2793 2794 vmfne_vv(v0, src1, src1); 2795 vfadd_vv(dst, src1, src1, Assembler::v0_t); 2796 vmfne_vv(v0, src2, src2); 2797 vfadd_vv(dst, src2, src2, Assembler::v0_t); 2798 } 2799 2800 // Set dst to NaN if any NaN input. 2801 // The destination vector register elements corresponding to masked-off elements 2802 // are handled with a mask-undisturbed policy. 2803 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, 2804 VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2, 2805 BasicType bt, bool is_min, uint vector_length) { 2806 assert_different_registers(src1, src2, tmp1, tmp2); 2807 vsetvli_helper(bt, vector_length); 2808 2809 // Check vector elements of src1 and src2 for NaN. 
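// (vmfeq_vv(x, x) sets a lane to 1 only when that lane is not NaN, since
// NaN != NaN; the vmandn with vmask below therefore selects the masked-on
// lanes that do hold a NaN, and the vfadd propagates that NaN into dst.)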
2810 vmfeq_vv(tmp1, src1, src1); 2811 vmfeq_vv(tmp2, src2, src2); 2812 2813 vmandn_mm(v0, vmask, tmp1); 2814 vfadd_vv(dst, src1, src1, Assembler::v0_t); 2815 vmandn_mm(v0, vmask, tmp2); 2816 vfadd_vv(dst, src2, src2, Assembler::v0_t); 2817 2818 vmand_mm(tmp2, tmp1, tmp2); 2819 vmand_mm(v0, vmask, tmp2); 2820 is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t) 2821 : vfmax_vv(dst, src1, src2, Assembler::v0_t); 2822 } 2823 2824 // Set dst to NaN if any NaN input. 2825 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst, 2826 FloatRegister src1, VectorRegister src2, 2827 VectorRegister tmp1, VectorRegister tmp2, 2828 bool is_double, bool is_min, uint vector_length, VectorMask vm) { 2829 assert_different_registers(dst, src1); 2830 assert_different_registers(src2, tmp1, tmp2); 2831 2832 Label L_done, L_NaN_1, L_NaN_2; 2833 // Set dst to src1 if src1 is NaN 2834 is_double ? feq_d(t0, src1, src1) 2835 : feq_s(t0, src1, src1); 2836 beqz(t0, L_NaN_2); 2837 2838 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length); 2839 vfmv_s_f(tmp2, src1); 2840 2841 is_min ? vfredmin_vs(tmp1, src2, tmp2, vm) 2842 : vfredmax_vs(tmp1, src2, tmp2, vm); 2843 vfmv_f_s(dst, tmp1); 2844 2845 // Checking NaNs in src2 2846 vmfne_vv(tmp1, src2, src2, vm); 2847 vcpop_m(t0, tmp1, vm); 2848 beqz(t0, L_done); 2849 2850 bind(L_NaN_1); 2851 vfredusum_vs(tmp1, src2, tmp2, vm); 2852 vfmv_f_s(dst, tmp1); 2853 j(L_done); 2854 2855 bind(L_NaN_2); 2856 is_double ? fmv_d(dst, src1) 2857 : fmv_s(dst, src1); 2858 bind(L_done); 2859 } 2860 2861 bool C2_MacroAssembler::in_scratch_emit_size() { 2862 if (ciEnv::current()->task() != nullptr) { 2863 PhaseOutput* phase_output = Compile::current()->output(); 2864 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2865 return true; 2866 } 2867 } 2868 return MacroAssembler::in_scratch_emit_size(); 2869 } 2870 2871 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1, 2872 VectorRegister src2, VectorRegister tmp, 2873 int opc, BasicType bt, uint vector_length, VectorMask vm) { 2874 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2875 vsetvli_helper(bt, vector_length); 2876 vmv_s_x(tmp, src1); 2877 switch (opc) { 2878 case Op_AddReductionVI: 2879 case Op_AddReductionVL: 2880 vredsum_vs(tmp, src2, tmp, vm); 2881 break; 2882 case Op_AndReductionV: 2883 vredand_vs(tmp, src2, tmp, vm); 2884 break; 2885 case Op_OrReductionV: 2886 vredor_vs(tmp, src2, tmp, vm); 2887 break; 2888 case Op_XorReductionV: 2889 vredxor_vs(tmp, src2, tmp, vm); 2890 break; 2891 case Op_MaxReductionV: 2892 vredmax_vs(tmp, src2, tmp, vm); 2893 break; 2894 case Op_MinReductionV: 2895 vredmin_vs(tmp, src2, tmp, vm); 2896 break; 2897 default: 2898 ShouldNotReachHere(); 2899 } 2900 vmv_x_s(dst, tmp); 2901 } 2902 2903 // Set vl and vtype for full and partial vector operations. 
2904 // (vma = mu, vta = tu, vill = false) 2905 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) { 2906 Assembler::SEW sew = Assembler::elemtype_to_sew(bt); 2907 if (vector_length <= 31) { 2908 vsetivli(tmp, vector_length, sew, vlmul); 2909 } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) { 2910 vsetvli(tmp, x0, sew, vlmul); 2911 } else { 2912 mv(tmp, vector_length); 2913 vsetvli(tmp, tmp, sew, vlmul); 2914 } 2915 } 2916 2917 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2, 2918 int cond, BasicType bt, uint vector_length, VectorMask vm) { 2919 assert(is_integral_type(bt), "unsupported element type"); 2920 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers"); 2921 vsetvli_helper(bt, vector_length); 2922 vmclr_m(vd); 2923 switch (cond) { 2924 case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break; 2925 case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break; 2926 case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break; 2927 case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break; 2928 case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break; 2929 case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break; 2930 case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break; 2931 case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break; 2932 case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break; 2933 case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break; 2934 default: 2935 assert(false, "unsupported compare condition"); 2936 ShouldNotReachHere(); 2937 } 2938 } 2939 2940 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2, 2941 int cond, BasicType bt, uint vector_length, VectorMask vm) { 2942 assert(is_floating_point_type(bt), "unsupported element type"); 2943 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers"); 2944 vsetvli_helper(bt, vector_length); 2945 vmclr_m(vd); 2946 switch (cond) { 2947 case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break; 2948 case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break; 2949 case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break; 2950 case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break; 2951 case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break; 2952 case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break; 2953 default: 2954 assert(false, "unsupported compare condition"); 2955 ShouldNotReachHere(); 2956 } 2957 } 2958 2959 // In Matcher::scalable_predicate_reg_slots, 2960 // we assume each predicate register is one-eighth of the size of 2961 // scalable vector register, one mask bit per vector byte. 
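// For example, with MaxVectorSize == 32 bytes a mask spill or unspill below
// moves MaxVectorSize >> 3 == 4 bytes, matching that sizing assumption
// (illustrative value only).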
2962 void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) { 2963 vsetvli_helper(T_BYTE, MaxVectorSize >> 3); 2964 add(t0, sp, offset); 2965 vse8_v(v, t0); 2966 } 2967 2968 void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) { 2969 vsetvli_helper(T_BYTE, MaxVectorSize >> 3); 2970 add(t0, sp, offset); 2971 vle8_v(v, t0); 2972 } 2973 2974 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length, 2975 VectorRegister src, BasicType src_bt, bool is_signed) { 2976 assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size"); 2977 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type"); 2978 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands 2979 // The destination EEW is greater than the source EEW, the source EMUL is at least 1, 2980 // and the overlap is in the highest-numbered part of the destination register group. 2981 // Since LMUL=1, vd and vs cannot be the same. 2982 assert_different_registers(dst, src); 2983 2984 vsetvli_helper(dst_bt, vector_length); 2985 if (is_signed) { 2986 if (src_bt == T_BYTE) { 2987 switch (dst_bt) { 2988 case T_SHORT: 2989 vsext_vf2(dst, src); 2990 break; 2991 case T_INT: 2992 vsext_vf4(dst, src); 2993 break; 2994 case T_LONG: 2995 vsext_vf8(dst, src); 2996 break; 2997 default: 2998 ShouldNotReachHere(); 2999 } 3000 } else if (src_bt == T_SHORT) { 3001 if (dst_bt == T_INT) { 3002 vsext_vf2(dst, src); 3003 } else { 3004 vsext_vf4(dst, src); 3005 } 3006 } else if (src_bt == T_INT) { 3007 vsext_vf2(dst, src); 3008 } 3009 } else { 3010 if (src_bt == T_BYTE) { 3011 switch (dst_bt) { 3012 case T_SHORT: 3013 vzext_vf2(dst, src); 3014 break; 3015 case T_INT: 3016 vzext_vf4(dst, src); 3017 break; 3018 case T_LONG: 3019 vzext_vf8(dst, src); 3020 break; 3021 default: 3022 ShouldNotReachHere(); 3023 } 3024 } else if (src_bt == T_SHORT) { 3025 if (dst_bt == T_INT) { 3026 vzext_vf2(dst, src); 3027 } else { 3028 vzext_vf4(dst, src); 3029 } 3030 } else if (src_bt == T_INT) { 3031 vzext_vf2(dst, src); 3032 } 3033 } 3034 } 3035 3036 // Vector narrow from src to dst with specified element sizes. 3037 // High part of dst vector will be filled with zero. 3038 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length, 3039 VectorRegister src, BasicType src_bt) { 3040 assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size"); 3041 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type"); 3042 mv(t0, vector_length); 3043 if (src_bt == T_LONG) { 3044 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions 3045 // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source. 3046 // So we can currently only scale down by 1/2 the width at a time. 
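// For example, narrowing T_LONG to T_BYTE takes three vncvt_x_x_w steps
// (e64 -> e32 -> e16 -> e8), re-narrowing dst in place at each step, as done below.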
3047 vsetvli(t0, t0, Assembler::e32, Assembler::mf2); 3048 vncvt_x_x_w(dst, src); 3049 if (dst_bt == T_SHORT || dst_bt == T_BYTE) { 3050 vsetvli(t0, t0, Assembler::e16, Assembler::mf2); 3051 vncvt_x_x_w(dst, dst); 3052 if (dst_bt == T_BYTE) { 3053 vsetvli(t0, t0, Assembler::e8, Assembler::mf2); 3054 vncvt_x_x_w(dst, dst); 3055 } 3056 } 3057 } else if (src_bt == T_INT) { 3058 // T_SHORT 3059 vsetvli(t0, t0, Assembler::e16, Assembler::mf2); 3060 vncvt_x_x_w(dst, src); 3061 if (dst_bt == T_BYTE) { 3062 vsetvli(t0, t0, Assembler::e8, Assembler::mf2); 3063 vncvt_x_x_w(dst, dst); 3064 } 3065 } else if (src_bt == T_SHORT) { 3066 vsetvli(t0, t0, Assembler::e8, Assembler::mf2); 3067 vncvt_x_x_w(dst, src); 3068 } 3069 } 3070 3071 #define VFCVT_SAFE(VFLOATCVT) \ 3072 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \ 3073 assert_different_registers(dst, src); \ 3074 vxor_vv(dst, dst, dst); \ 3075 vmfeq_vv(v0, src, src); \ 3076 VFLOATCVT(dst, src, Assembler::v0_t); \ 3077 } 3078 3079 VFCVT_SAFE(vfcvt_rtz_x_f_v); 3080 3081 #undef VFCVT_SAFE 3082 3083 // Extract a scalar element from an vector at position 'idx'. 3084 // The input elements in src are expected to be of integral type. 3085 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt, 3086 int idx, VectorRegister tmp) { 3087 assert(is_integral_type(bt), "unsupported element type"); 3088 assert(idx >= 0, "idx cannot be negative"); 3089 // Only need the first element after vector slidedown 3090 vsetvli_helper(bt, 1); 3091 if (idx == 0) { 3092 vmv_x_s(dst, src); 3093 } else if (idx <= 31) { 3094 vslidedown_vi(tmp, src, idx); 3095 vmv_x_s(dst, tmp); 3096 } else { 3097 mv(t0, idx); 3098 vslidedown_vx(tmp, src, t0); 3099 vmv_x_s(dst, tmp); 3100 } 3101 } 3102 3103 // Extract a scalar element from an vector at position 'idx'. 3104 // The input elements in src are expected to be of floating point type. 3105 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt, 3106 int idx, VectorRegister tmp) { 3107 assert(is_floating_point_type(bt), "unsupported element type"); 3108 assert(idx >= 0, "idx cannot be negative"); 3109 // Only need the first element after vector slidedown 3110 vsetvli_helper(bt, 1); 3111 if (idx == 0) { 3112 vfmv_f_s(dst, src); 3113 } else if (idx <= 31) { 3114 vslidedown_vi(tmp, src, idx); 3115 vfmv_f_s(dst, tmp); 3116 } else { 3117 mv(t0, idx); 3118 vslidedown_vx(tmp, src, t0); 3119 vfmv_f_s(dst, tmp); 3120 } 3121 }