/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
                                  Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
  // Use cr register to indicate the fast_lock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label object_has_monitor;
  // Finish fast lock successfully. MUST branch to with flag == 0
  Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);

  mv(flag, 1);

  // Load markWord from object into displaced_header.
  ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    lwu(tmp, Address(tmp, Klass::access_flags_offset()));
    test_bit(tmp, tmp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
    bnez(tmp, slow_path);
  }

  // Check for existing monitor
  test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value));
  bnez(tmp, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    j(slow_path);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
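
    // Orientation note (summarizing the usual HotSpot markWord encoding relied
    // on below): the two low "lock bits" of the mark are 0b01 for an unlocked
    // object, 0b00 for a stack-locked object (the mark then points at the
    // BasicLock/box on the owning frame), and 0b10 for an inflated lock (the
    // mark is then a tagged ObjectMonitor*).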
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    ori(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64,
            Assembler::aq, Assembler::rl, /*result*/disp_hdr);
    beq(disp_hdr, tmp, locked);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and continue at label locked.
    // Otherwise we did not see an unlocked object, so try the fast recursive case.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    sub(disp_hdr, disp_hdr, sp);
    mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
    // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto label locked,
    // hence we can store 0 as the displaced header in the box, which indicates that it is a
    // recursive lock.
    andr(tmp/*==0?*/, disp_hdr, tmp);
    sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    beqz(tmp, locked);
    j(slow_path);
  }

  // Handle existing monitor.
  bind(object_has_monitor);
  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
  cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64,
          Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected)

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
  mv(tmp, (address)markWord::unused_mark().value());
  sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  beqz(tmp3Reg, locked); // CAS success means locking succeeded

  bne(tmp3Reg, xthread, slow_path); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg);

  bind(locked);
  mv(flag, zr);
  increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2Reg, tmp3Reg);

#ifdef ASSERT
  // Check that locked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Lock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Lock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
                                    Register tmp1Reg, Register tmp2Reg) {
  // Use cr register to indicate the fast_unlock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label object_has_monitor;
  // Finish fast unlock successfully. MUST branch to with flag == 0
  Label unlocked;
  // Finish fast unlock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);

  mv(flag, 1);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    beqz(disp_hdr, unlocked);
  }

  // Handle existing monitor.
  ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  test_bit(t0, tmp, exact_log2(markWord::monitor_value));
  bnez(t0, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    j(slow_path);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a light weight lock, this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64,
            Assembler::relaxed, Assembler::rl, /*result*/tmp);
    beq(box, tmp, unlocked); // box == tmp if cas succeeds
    j(slow_path);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.

  // Recursive lock
  addi(disp_hdr, disp_hdr, -1);
  sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  j(unlocked);

  bind(notRecursive);
  ld(t0, Address(tmp, ObjectMonitor::EntryList_offset()));
  ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(t0, t0, disp_hdr); // Will be 0 if both are 0.
  bnez(t0, slow_path);

  // need a release store here
  la(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
  sd(zr, Address(tmp)); // set unowned

  bind(unlocked);
  mv(flag, zr);
  decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp1Reg, tmp2Reg);

#ifdef ASSERT
  // Check that unlocked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register tmp1, Register tmp2, Register tmp3) {
  // Flag register, zero for success; non-zero for failure.
  Register flag = t1;

  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);

  mv(flag, 1);
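
  // Note (derived from the code below, for orientation): LM_LIGHTWEIGHT keeps a
  // small per-thread "lock-stack" of oops inside the JavaThread. The value at
  // JavaThread::lock_stack_top_offset() is the byte offset, relative to xthread,
  // of the first free slot; pushing stores the oop at xthread + top and bumps
  // top by oopSize, and the current top element lives at xthread + top - oopSize.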

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == 0
  Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp1, obj);
    lwu(tmp1, Address(tmp1, Klass::access_flags_offset()));
    test_bit(tmp1, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
    bnez(tmp1, slow_path);
  }

  const Register tmp1_mark = tmp1;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
    Label push;

    const Register tmp2_top = tmp2;
    const Register tmp3_t = tmp3;

    // Check if lock-stack is full.
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    mv(tmp3_t, (unsigned)LockStack::end_offset());
    bge(tmp2_top, tmp3_t, slow_path);

    // Check if recursive.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
    xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
    bne(tmp1_mark, tmp3_t, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    add(tmp3_t, xthread, tmp2_top);
    sd(obj, Address(tmp3_t));
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register tmp1_tagged_monitor = tmp1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;
    const Register tmp2_owner_addr = tmp2;
    const Register tmp3_owner = tmp3;

    // Compute owner address.
    la(tmp2_owner_addr, Address(tmp1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));

    // CAS owner (null => current thread).
    cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ xthread, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
    beqz(tmp3_owner, locked);

    // Check if recursive.
    bne(tmp3_owner, xthread, slow_path);

    // Recursive.
    increment(Address(tmp1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1, tmp2, tmp3);
  }

  bind(locked);
  mv(flag, zr);
  increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);

#ifdef ASSERT
  // Check that locked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Lock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Lock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register tmp1, Register tmp2,
                                                Register tmp3) {
  // Flag register, zero for success; non-zero for failure.
  Register flag = t1;

  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);

  mv(flag, 1);

  // Handle inflated monitor.
  Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. unlocked MUST branch to with flag == 0
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag != 0
  Label slow_path;

  const Register tmp1_mark = tmp1;
  const Register tmp2_top = tmp2;
  const Register tmp3_t = tmp3;

  { // Lightweight unlock

    // Check if obj is top of lock-stack.
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    subw(tmp2_top, tmp2_top, oopSize);
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t));
    // Top of lock stack was not obj. Must be monitor.
    bne(obj, tmp3_t, inflated_load_monitor);

    // Pop lock-stack.
    DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
    DEBUG_ONLY(sd(zr, Address(tmp3_t));)
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, unlocked);

    // Not recursive.
    // Load Mark.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
    beq(tmp1_mark, tmp3_t, unlocked);

    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
    DEBUG_ONLY(sd(obj, Address(tmp3_t));)
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_monitor);
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(tmp2_top, tmp2_top, oopSize);
    mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
    blt(tmp2_top, tmp3_t, check_done);
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t));
    bne(obj, tmp3_t, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    // mark contains the tagged ObjectMonitor*.
    const Register tmp1_monitor = tmp1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;

    // Untag the monitor.
    sub(tmp1_monitor, tmp1_mark, monitor_tag);

    const Register tmp2_recursions = tmp2;
    Label not_recursive;

    // Check if recursive.
    ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
    beqz(tmp2_recursions, not_recursive);

    // Recursive unlock.
    addi(tmp2_recursions, tmp2_recursions, -1);
    sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
    j(unlocked);

    bind(not_recursive);

    Label release;
    const Register tmp2_owner_addr = tmp2;

    // Compute owner address.
    la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));

    // Check if the entry lists are empty.
    ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset()));
    ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset()));
    orr(t0, t0, tmp3_t);
    beqz(t0, release);

    // The owner may be anonymous and we removed the last obj entry in
    // the lock-stack. This loses the information about the owner.
    // Write the thread to the owner field so the runtime knows the owner.
    sd(xthread, Address(tmp2_owner_addr));
    j(slow_path);

    bind(release);
    // Set owner to null.
    membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
    sd(zr, Address(tmp2_owner_addr));
  }

  bind(unlocked);
  mv(flag, zr);
  decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);

#ifdef ASSERT
  // Check that unlocked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

// short string
// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
                                                  Register ch, Register result,
                                                  bool isL)
{
  Register ch1 = t0;
  Register index = t1;

  BLOCK_COMMENT("string_indexof_char_short {");

  Label LOOP, LOOP1, LOOP4, LOOP8;
  Label MATCH, MATCH1, MATCH2, MATCH3,
        MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;

  mv(result, -1);
  mv(index, zr);

  bind(LOOP);
  addi(t0, index, 8);
  ble(t0, cnt1, LOOP8);
  addi(t0, index, 4);
  ble(t0, cnt1, LOOP4);
  j(LOOP1);

  bind(LOOP8);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
  beq(ch, ch1, MATCH4);
  isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
  beq(ch, ch1, MATCH5);
  isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
  beq(ch, ch1, MATCH6);
  isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
  beq(ch, ch1, MATCH7);
  addi(index, index, 8);
  addi(str1, str1, isL ? 8 : 16);
  blt(index, cnt1, LOOP);
  j(NOMATCH);

  bind(LOOP4);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  addi(index, index, 4);
  addi(str1, str1, isL ? 4 : 8);
  bge(index, cnt1, NOMATCH);

  bind(LOOP1);
  isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
  beq(ch, ch1, MATCH);
  addi(index, index, 1);
  addi(str1, str1, isL ? 1 : 2);
  blt(index, cnt1, LOOP1);
  j(NOMATCH);

  bind(MATCH1);
  addi(index, index, 1);
  j(MATCH);

  bind(MATCH2);
  addi(index, index, 2);
  j(MATCH);

  bind(MATCH3);
  addi(index, index, 3);
  j(MATCH);

  bind(MATCH4);
  addi(index, index, 4);
  j(MATCH);

  bind(MATCH5);
  addi(index, index, 5);
  j(MATCH);

  bind(MATCH6);
  addi(index, index, 6);
  j(MATCH);

  bind(MATCH7);
  addi(index, index, 7);

  bind(MATCH);
  mv(result, index);
  bind(NOMATCH);
  BLOCK_COMMENT("} string_indexof_char_short");
}

// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2,
                                            Register tmp3, Register tmp4,
                                            bool isL)
{
  Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
  Register ch1 = t0;
  Register orig_cnt = t1;
  Register mask1 = tmp3;
  Register mask2 = tmp2;
  Register match_mask = tmp1;
  Register trailing_char = tmp4;
  Register unaligned_elems = tmp4;

  BLOCK_COMMENT("string_indexof_char {");
  beqz(cnt1, NOMATCH);

  addi(t0, cnt1, isL ? -32 : -16);
  bgtz(t0, DO_LONG);
  string_indexof_char_short(str1, cnt1, ch, result, isL);
  j(DONE);

  bind(DO_LONG);
  mv(orig_cnt, cnt1);
  if (AvoidUnalignedAccesses) {
    Label ALIGNED;
    andi(unaligned_elems, str1, 0x7);
    beqz(unaligned_elems, ALIGNED);
    sub(unaligned_elems, unaligned_elems, 8);
    neg(unaligned_elems, unaligned_elems);
    if (!isL) {
      srli(unaligned_elems, unaligned_elems, 1);
    }
    // do unaligned part per element
    string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
    bgez(result, DONE);
    mv(orig_cnt, cnt1);
    sub(cnt1, cnt1, unaligned_elems);
    bind(ALIGNED);
  }

  // duplicate ch
  if (isL) {
    slli(ch1, ch, 8);
    orr(ch, ch1, ch);
  }
  slli(ch1, ch, 16);
  orr(ch, ch1, ch);
  slli(ch1, ch, 32);
  orr(ch, ch1, ch);

  if (!isL) {
    slli(cnt1, cnt1, 1);
  }

  uint64_t mask0101 = UCONST64(0x0101010101010101);
  uint64_t mask0001 = UCONST64(0x0001000100010001);
  mv(mask1, isL ? mask0101 : mask0001);
  uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
  uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
  mv(mask2, isL ? mask7f7f : mask7fff);
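
  // Note: compute_match_mask below relies on the classic SWAR "find a lane equal
  // to ch" trick. A sketch of the idea, in its textbook form (the exact
  // instruction ordering inside compute_match_mask may differ): with
  // x = chunk ^ repeated_ch, a lane is zero exactly where the chunk matched ch,
  // and
  //   match_mask = (x - mask1) & ~x & ~mask2
  // sets the high bit of every zero lane (mask1 = 0x01 per lane, ~mask2 = 0x80
  // per lane). For example, a Latin1 chunk containing a single byte equal to ch
  // yields an x with one zero byte, so match_mask has exactly that byte's 0x80
  // bit set and ctz locates the match.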

  bind(CH1_LOOP);
  ld(ch1, Address(str1));
  addi(str1, str1, 8);
  addi(cnt1, cnt1, -8);
  compute_match_mask(ch1, ch, match_mask, mask1, mask2);
  bnez(match_mask, HIT);
  bgtz(cnt1, CH1_LOOP);
  j(NOMATCH);

  bind(HIT);
  ctzc_bit(trailing_char, match_mask, isL, ch1, result);
  srli(trailing_char, trailing_char, 3);
  addi(cnt1, cnt1, 8);
  ble(cnt1, trailing_char, NOMATCH);
  // match case
  if (!isL) {
    srli(cnt1, cnt1, 1);
    srli(trailing_char, trailing_char, 1);
  }

  sub(result, orig_cnt, cnt1);
  add(result, result, trailing_char);
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof_char");
}

typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);

// Search for needle in haystack and return index or -1
// x10: result
// x11: haystack
// x12: haystack_len
// x13: needle
// x14: needle_len
void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
                                       Register haystack_len, Register needle_len,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       Register result, int ae)
{
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;

  Register ch1 = t0;
  Register ch2 = t1;
  Register nlen_tmp = tmp1; // needle len tmp
  Register hlen_tmp = tmp2; // haystack len tmp
  Register result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                    (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_indexof {");

  // Note, inline_string_indexOf() generates checks:
  // if (pattern.count > src.count) return -1;
  // if (pattern.count == 0) return 0;

  // We have two strings, a source string in haystack, haystack_len and a pattern string
  // in needle, needle_len. Find the first occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  // needle_len >= 8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
  sub(result_tmp, haystack_len, needle_len);
  // needle_len < 8, use linear scan
  sub(t0, needle_len, 8);
  bltz(t0, LINEARSEARCH);
  // needle_len >= 256, use linear scan
  sub(t0, needle_len, 256);
  bgez(t0, LINEARSTUB);
  // needle_len >= haystack_len/4, use linear scan
  srli(t0, haystack_len, 2);
  bge(needle_len, t0, LINEARSTUB);

  // Boyer-Moore-Horspool introduction:
  // The Boyer-Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules.
  // The 'Bad Character' rule and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = pattern[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = src[i+j];
  //     if (pattern[m-1] == c) {
  //       int k;
  //       for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  //       if (k < 0) return j;
  //     }
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
  //     // LL case: (c < 256) always true. Remove branch
  //     j += bc[c];
  //     #endif
  //     #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
  //     // UU case: need if (c < ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[c];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
  //     // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[c];
  //     else
  //       j += m
  //     #endif
  //   }
  //   return -1;
  // }
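  //
  // A small worked example of the bad-character table (illustrative only, not
  // part of the generated code): for the pattern "abcab" (m = 5), the
  // preprocessing loop above yields bc['a'] = 1, bc['b'] = 3, bc['c'] = 2 and
  // bc[x] = 5 for every other character, i.e. the shift is the distance from a
  // character's last occurrence (ignoring the final position) to the end of
  // the pattern.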

  // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
  Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;

  Register haystack_end = haystack_len;
  Register skipch = tmp2;

  // pattern length is >=8, so, we can read at least 1 register for cases when
  // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
  // UL case. We'll re-read last character in inner pre-loop code to have
  // single outer pre-loop load
  const int firstStep = isLL ? 7 : 3;

  const int ASIZE = 256;
  const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)

  sub(sp, sp, ASIZE);

  // init BC offset table with default value: needle_len
  slli(t0, needle_len, 8);
  orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
  slli(tmp1, t0, 16);
  orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
  slli(tmp1, t0, 32);
  orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]

  mv(ch1, sp); // ch1 is t0
  mv(tmp6, ASIZE / STORE_BYTES); // loop iterations

  bind(BM_INIT_LOOP);
  // for (i = 0; i < ASIZE; ++i)
  //   bc[i] = m;
  for (int i = 0; i < 4; i++) {
    sd(tmp5, Address(ch1, i * wordSize));
  }
  add(ch1, ch1, 32);
  sub(tmp6, tmp6, 4);
  bgtz(tmp6, BM_INIT_LOOP);

  sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
  Register orig_haystack = tmp5;
  mv(orig_haystack, haystack);
  // result_tmp = tmp4
  shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
  sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
  mv(tmp3, needle);

  // for (i = 0; i < m - 1; ) {
  //   c = pattern[i];
  //   ++i;
  //   // c < 256 for Latin1 string, so, no need for branch
  //   #ifdef PATTERN_STRING_IS_LATIN1
  //   bc[c] = m - i;
  //   #else
  //   if (c < ASIZE) bc[c] = m - i;
  //   #endif
  // }
  bind(BCLOOP);
  (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
  add(tmp3, tmp3, needle_chr_size);
  if (!needle_isL) {
    // ae == StrIntrinsicNode::UU
    mv(tmp6, ASIZE);
    bgeu(ch1, tmp6, BCSKIP);
  }
  add(tmp4, sp, ch1);
  sb(ch2, Address(tmp4)); // store skip offset to BC offset table

  bind(BCSKIP);
  sub(ch2, ch2, 1); // for next pattern element, skip distance -1
  bgtz(ch2, BCLOOP);

  // tmp6: pattern end, address after needle
  shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
  if (needle_isL == haystack_isL) {
    // load last 8 bytes (8LL/4UU symbols)
    ld(tmp6, Address(tmp6, -wordSize));
  } else {
    // UL: from UTF-16(source) search Latin1(pattern)
    lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
    // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
    // We'll have to wait until load completed, but it's still faster than per-character loads+checks
    srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
    slli(ch2, tmp6, XLEN - 24);
    srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
    slli(ch1, tmp6, XLEN - 16);
    srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
    andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
    slli(ch2, ch2, 16);
    orr(ch2, ch2, ch1); // 0x00000b0c
    slli(result, tmp3, 48); // use result as temp register
    orr(tmp6, tmp6, result); // 0x0a00000d
    slli(result, ch2, 16);
    orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
  }

  // i = m - 1;
  // skipch = j + i;
  // if (skipch == pattern[m - 1]
  //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  // else
  //   move j with bad char offset table
  bind(BMLOOPSTR2);
  // compare pattern to source string backward
  shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
  (this->*haystack_load_1chr)(skipch, Address(result), noreg);
  sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
  if (needle_isL == haystack_isL) {
    // re-init tmp3. It's for free because it's executed in parallel with
    // load above. Alternative is to initialize it before loop, but it'll
    // affect performance on in-order systems with 2 or more ld/st pipelines
    srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
  }
  if (!isLL) { // UU/UL case
    slli(ch2, nlen_tmp, 1); // offsets in bytes
  }
  bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
  add(result, haystack, isLL ? nlen_tmp : ch2);
  // load 8 bytes from source string
  // if isLL is false then read granularity can be 2
  load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
  mv(ch1, tmp6);
  if (isLL) {
    j(BMLOOPSTR1_AFTER_LOAD);
  } else {
    sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
    j(BMLOOPSTR1_CMP);
  }

  bind(BMLOOPSTR1);
  shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
  (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
  shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
  (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);

  bind(BMLOOPSTR1_AFTER_LOAD);
  sub(nlen_tmp, nlen_tmp, 1);
  bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);

  bind(BMLOOPSTR1_CMP);
  beq(ch1, ch2, BMLOOPSTR1);

  bind(BMSKIP);
  if (!isLL) {
    // if we've met UTF symbol while searching Latin1 pattern, then we can
    // skip needle_len symbols
    if (needle_isL != haystack_isL) {
      mv(result_tmp, needle_len);
    } else {
      mv(result_tmp, 1);
    }
    mv(t0, ASIZE);
    bgeu(skipch, t0, BMADV);
  }
  add(result_tmp, sp, skipch);
  lbu(result_tmp, Address(result_tmp)); // load skip offset

  bind(BMADV);
  sub(nlen_tmp, needle_len, 1);
  // move haystack after bad char skip offset
  shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
  ble(haystack, haystack_end, BMLOOPSTR2);
  add(sp, sp, ASIZE);
  j(NOMATCH);

  bind(BMLOOPSTR1_LASTCMP);
  bne(ch1, ch2, BMSKIP);

  bind(BMMATCH);
  sub(result, haystack, orig_haystack);
  if (!haystack_isL) {
    srli(result, result, 1);
  }
  add(sp, sp, ASIZE);
  j(DONE);

  bind(LINEARSTUB);
  sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
  bltz(t0, LINEARSEARCH);
  mv(result, zr);
  RuntimeAddress stub = nullptr;
  if (isLL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
    assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
  } else if (needle_isL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
    assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
  } else {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
    assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
  }
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(LINEARSEARCH);
  string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof");
}

// string_indexof
// result: x10
// src: x11
// src_count: x12
// pattern: x13
// pattern_count: x14 or 1/2/3/4
void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
                                                  Register haystack_len, Register needle_len,
                                                  Register tmp1, Register tmp2,
                                                  Register tmp3, Register tmp4,
                                                  int needle_con_cnt, Register result, int ae)
{
  // Note:
  // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
  // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
  assert(needle_con_cnt <= 4, "Invalid needle constant count");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Register ch1 = t0;
  Register ch2 = t1;
  Register hlen_neg = haystack_len, nlen_neg = needle_len;
  Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;

  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                    (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
  load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;

  Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;

  Register first = tmp3;

  if (needle_con_cnt == -1) {
    Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

    sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
    bltz(t0, DOSHORT);

    (this->*needle_load_1chr)(first, Address(needle), noreg);
    slli(t0, needle_len, needle_chr_shift);
    add(needle, needle, t0);
    neg(nlen_neg, t0);
    slli(t0, result_tmp, haystack_chr_shift);
    add(haystack, haystack, t0);
    neg(hlen_neg, t0);

    bind(FIRST_LOOP);
    add(t0, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    add(nlen_tmp, nlen_neg, needle_chr_size);
    add(hlen_tmp, hlen_neg, haystack_chr_size);
    bgez(nlen_tmp, MATCH);

    bind(STR1_NEXT);
    add(ch1, needle, nlen_tmp);
    (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    add(nlen_tmp, nlen_tmp, needle_chr_size);
    add(hlen_tmp, hlen_tmp, haystack_chr_size);
    bltz(nlen_tmp, STR1_NEXT);
    j(MATCH);

    bind(DOSHORT);
    if (needle_isL == haystack_isL) {
      sub(t0, needle_len, 2);
      bltz(t0, DO1);
      bgtz(t0, DO3);
    }
  }

  if (needle_con_cnt == 4) {
    Label CH1_LOOP;
    (this->*load_4chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 4);
    slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload first value, then we will read by 1 character per loop, instead of four
      // just shifting previous ch2 right by size of character in bits
      add(tmp3, haystack, hlen_neg);
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
      if (isLL) {
        // need to erase 1 most significant byte in 32-bit value of ch2
        slli(ch2, ch2, 40);
        srli(ch2, ch2, 32);
      } else {
        slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
      }
    }

    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
      slli(tmp3, tmp3, isLL ? 24 : 48);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
    Label CH1_LOOP;
    BLOCK_COMMENT("string_indexof DO2 {");
    bind(DO2);
    (this->*load_2chr)(ch1, Address(needle), noreg);
    if (needle_con_cnt == 2) {
      sub(result_tmp, haystack_len, 2);
    }
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload first value, then we will read by 1 character per loop, instead of two
      // just shifting previous ch2 right by size of character in bits
      add(tmp3, haystack, hlen_neg);
      (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
      slli(ch2, ch2, isLL ? 8 : 16);
    }
    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
      slli(tmp3, tmp3, isLL ? 8 : 16);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_2chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
    BLOCK_COMMENT("} string_indexof DO2");
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
    Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
    BLOCK_COMMENT("string_indexof DO3 {");

    bind(DO3);
    (this->*load_2chr)(first, Address(needle), noreg);
    (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
    if (needle_con_cnt == 3) {
      sub(result_tmp, haystack_len, 3);
    }
    slli(hlen_tmp, result_tmp, haystack_chr_shift);
    add(haystack, haystack, hlen_tmp);
    neg(hlen_neg, hlen_tmp);

    bind(FIRST_LOOP);
    add(ch2, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
      (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
      slli(tmp2, tmp2, isLL ? 8 : 16);
      add(ch2, ch2, tmp2);
    } else {
      (this->*load_2chr)(ch2, Address(ch2), noreg);
    }
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    j(MATCH);
    BLOCK_COMMENT("} string_indexof DO3");
  }

  if (needle_con_cnt == -1 || needle_con_cnt == 1) {
    Label DO1_LOOP;

    BLOCK_COMMENT("string_indexof DO1 {");
    bind(DO1);
    (this->*needle_load_1chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 1);
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);

    bind(DO1_LOOP);
    add(tmp3, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, DO1_LOOP);
    BLOCK_COMMENT("} string_indexof DO1");
  }

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(MATCH);
  srai(t0, hlen_neg, haystack_chr_shift);
  add(result, result_tmp, t0);

  bind(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       int ae)
{
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
        DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
        SHORT_LOOP_START, TAIL_CHECK, L;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // for L strings, 1 byte for 1 character
  // for U strings, 2 bytes for 1 character
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize / 2;

  load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) {
    sraiw(cnt1, cnt1, 1);
  }
  if (!str2_isL) {
    sraiw(cnt2, cnt2, 1);
  }

  // Compute the minimum of the string lengths and save the difference in result.
  sub(result, cnt1, cnt2);
  bgt(cnt1, cnt2, L);
  mv(cnt2, cnt1);
  bind(L);

  // A very short string
  mv(t0, minCharsInWord);
  ble(cnt2, t0, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      // check if str1 and str2 is same pointer
      beq(str1, str2, DONE);
      // load 8 bytes once to compare
      ld(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      sub(cnt2, cnt2, minCharsInWord);
      beqz(cnt2, TAIL_CHECK);
      // convert cnt2 from characters to bytes
      if (!str1_isL) {
        slli(cnt2, cnt2, 1);
      }
      add(str2, str2, cnt2);
      add(str1, str1, cnt2);
      sub(cnt2, zr, cnt2);
    } else if (isLU) { // LU case
      lwu(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      add(str1, str1, cnt2);
      sub(cnt1, zr, cnt2);
      slli(cnt2, cnt2, 1);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 4);
    } else { // UL case
      ld(tmp1, Address(str1));
      lwu(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      slli(t0, cnt2, 1);
      sub(cnt1, zr, t0);
      add(str1, str1, t0);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 8);
    }
    addi(cnt2, cnt2, isUL ? 4 : 8);
    bne(tmp1, tmp2, DIFFERENCE);
    bgez(cnt2, TAIL);

    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) { // LL or UU
      add(t0, str1, cnt2);
      ld(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt2, cnt2, 8);
    } else if (isLU) { // LU case
      add(t0, str1, cnt1);
      lwu(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt1, cnt1, 4);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      addi(cnt2, cnt2, 8);
    } else { // UL case
      add(t0, str2, cnt2);
      lwu(tmp2, Address(t0));
      add(t0, str1, cnt1);
      ld(tmp1, Address(t0));
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      addi(cnt1, cnt1, 8);
      addi(cnt2, cnt2, 4);
    }
    bne(tmp1, tmp2, DIFFERENCE);
    bltz(cnt2, NEXT_WORD);
    bind(TAIL);
    if (str1_isL == str2_isL) { // LL or UU
      load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
      load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
    } else if (isLU) { // LU case
      load_int_misaligned(tmp1, Address(str1), tmp3, false);
      load_long_misaligned(tmp2, Address(str2), tmp3, 2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
    } else { // UL case
      load_int_misaligned(tmp2, Address(str2), tmp3, false);
      load_long_misaligned(tmp1, Address(str1), tmp3, 2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
    }
    bind(TAIL_CHECK);
    beq(tmp1, tmp2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    xorr(tmp3, tmp1, tmp2);
    ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
    srl(tmp1, tmp1, result);
    srl(tmp2, tmp2, result);
    if (isLL) {
      andi(tmp1, tmp1, 0xFF);
      andi(tmp2, tmp2, 0xFF);
    } else {
      andi(tmp1, tmp1, 0xFFFF);
      andi(tmp2, tmp2, 0xFFFF);
    }
    sub(result, tmp1, tmp2);
    j(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch (ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  beqz(cnt2, DONE);
  // arrange code to do most branches while loading and to load the next
  // characters while comparing the previous ones
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  j(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(t0, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bne(tmp1, cnt1, SHORT_LOOP_TAIL);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  beq(tmp2, t0, SHORT_LOOP);
  sub(result, tmp2, t0);
  j(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  j(DONE);
  bind(SHORT_LAST2);
  beq(tmp2, t0, DONE);
  sub(result, tmp2, t0);

  j(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bind(SHORT_LAST);
  beq(tmp1, cnt1, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::arrays_equals(Register a1, Register a2,
                                      Register tmp1, Register tmp2, Register tmp3,
                                      Register result, int elem_size) {
  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0);

  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);

  Register cnt1 = tmp3;
  Register cnt2 = tmp1; // cnt2 only used in array length compare
  Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01;

  BLOCK_COMMENT("arrays_equals {");

  // if (a1 == a2), return true
  beq(a1, a2, SAME);

  mv(result, false);
  // if (a1 == nullptr || a2 == nullptr)
  //   return false;
  beqz(a1, DONE);
  beqz(a2, DONE);

  // if (a1.length != a2.length)
  //   return false;
  lwu(cnt1, Address(a1, length_offset));
  lwu(cnt2, Address(a2, length_offset));
  bne(cnt1, cnt2, DONE);

  la(a1, Address(a1, base_offset));
  la(a2, Address(a2, base_offset));
  // Check for short strings, i.e. smaller than wordSize.
  addi(cnt1, cnt1, -elem_per_word);
  bltz(cnt1, SHORT);

  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ld(tmp1, Address(a1));
    ld(tmp2, Address(a2));
    addi(cnt1, cnt1, -elem_per_word);
    addi(a1, a1, wordSize);
    addi(a2, a2, wordSize);
    bne(tmp1, tmp2, DONE);
  } bgez(cnt1, NEXT_WORD);

  addi(tmp1, cnt1, elem_per_word);
  beqz(tmp1, SAME);

  bind(SHORT);
  test_bit(tmp1, cnt1, 2 - log_elem_size);
  beqz(tmp1, TAIL03); // 0-7 bytes left.
  {
    lwu(tmp1, Address(a1));
    lwu(tmp2, Address(a2));
    addi(a1, a1, 4);
    addi(a2, a2, 4);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL03);
  test_bit(tmp1, cnt1, 1 - log_elem_size);
  beqz(tmp1, TAIL01); // 0-3 bytes left.
  {
    lhu(tmp1, Address(a1));
    lhu(tmp2, Address(a2));
    addi(a1, a1, 2);
    addi(a2, a2, 2);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing byte arrays.
    test_bit(tmp1, cnt1, 0);
    beqz(tmp1, SAME); // 0-1 bytes left.
    {
      lbu(tmp1, Address(a1));
      lbu(tmp2, Address(a2));
      bne(tmp1, tmp2, DONE);
    }
  }

  bind(SAME);
  mv(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} arrays_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1 and a2
// and the length in cnt1. There are two implementations.
// For arrays >= 8 bytes, all comparisons (except for the tail) are performed
// 8 bytes at a time. For the tail, we compare a word, then a halfword, and then a byte.
// For strings < 8 bytes, we compare a word, then a halfword, and then a byte.

void C2_MacroAssembler::string_equals(Register a1, Register a2,
                                      Register result, Register cnt1)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = t0;
  Register tmp2 = t1;

  assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);

  BLOCK_COMMENT("string_equals {");

  mv(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  addi(cnt1, cnt1, -wordSize);
  bltz(cnt1, SHORT);

  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ld(tmp1, Address(a1));
    ld(tmp2, Address(a2));
    addi(cnt1, cnt1, -wordSize);
    addi(a1, a1, wordSize);
    addi(a2, a2, wordSize);
    bne(tmp1, tmp2, DONE);
  } bgez(cnt1, NEXT_WORD);

  addi(tmp1, cnt1, wordSize);
  beqz(tmp1, SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  // 0-7 bytes left.
  test_bit(tmp1, cnt1, 2);
  beqz(tmp1, TAIL03);
  {
    lwu(tmp1, Address(a1));
    lwu(tmp2, Address(a2));
    addi(a1, a1, 4);
    addi(a2, a2, 4);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL03);
  // 0-3 bytes left.
  test_bit(tmp1, cnt1, 1);
  beqz(tmp1, TAIL01);
  {
    lhu(tmp1, Address(a1));
    lhu(tmp2, Address(a2));
    addi(a1, a1, 2);
    addi(a2, a2, 2);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL01);
  // 0-1 bytes left.
  test_bit(tmp1, cnt1, 0);
  beqz(tmp1, SAME);
  {
    lbu(tmp1, Address(a1));
    lbu(tmp2, Address(a2));
    bne(tmp1, tmp2, DONE);
  }

  // Arrays are equal.
  bind(SAME);
  mv(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}
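
// A note on the hashing routine that follows: its WIDE_LOOP unrolls the usual
// scalar polynomial hash update h = 31*h + a[i] by a stride of four, using
//   31*(31*(31*(31*h + a[0]) + a[1]) + a[2]) + a[3]
//     = 31^4*h + 31^3*a[0] + 31^2*a[1] + 31*a[2] + a[3]
// which is why 31^4 = 923521, 31^3 = 29791 and 31^2 = 961 are materialized as
// constants below, and 31*x is strength-reduced to (x << 5) - x.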
1710 bind(DONE); 1711 BLOCK_COMMENT("} string_equals"); 1712 } 1713 1714 // jdk.internal.util.ArraysSupport.vectorizedHashCode 1715 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 1716 Register tmp1, Register tmp2, Register tmp3, 1717 Register tmp4, Register tmp5, Register tmp6, 1718 BasicType eltype) 1719 { 1720 assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1); 1721 1722 const int elsize = arrays_hashcode_elsize(eltype); 1723 const int chunks_end_shift = exact_log2(elsize); 1724 1725 switch (eltype) { 1726 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 1727 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 1728 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 1729 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 1730 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 1731 default: 1732 ShouldNotReachHere(); 1733 } 1734 1735 const int stride = 4; 1736 const Register pow31_4 = tmp1; 1737 const Register pow31_3 = tmp2; 1738 const Register pow31_2 = tmp3; 1739 const Register chunks = tmp4; 1740 const Register chunks_end = chunks; 1741 1742 Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP; 1743 1744 // result has a value initially 1745 1746 beqz(cnt, DONE); 1747 1748 andi(chunks, cnt, ~(stride-1)); 1749 beqz(chunks, TAIL); 1750 1751 mv(pow31_4, 923521); // [31^^4] 1752 mv(pow31_3, 29791); // [31^^3] 1753 mv(pow31_2, 961); // [31^^2] 1754 1755 slli(chunks_end, chunks, chunks_end_shift); 1756 add(chunks_end, ary, chunks_end); 1757 andi(cnt, cnt, stride-1); // don't forget about tail! 1758 1759 bind(WIDE_LOOP); 1760 mulw(result, result, pow31_4); // 31^^4 * h 1761 arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype); 1762 arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype); 1763 arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype); 1764 arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype); 1765 mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0] 1766 addw(result, result, t0); 1767 mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1] 1768 addw(result, result, t1); 1769 slli(t0, tmp5, 5); // optimize 31^^1 * ary[i+2] 1770 subw(tmp5, t0, tmp5); // with ary[i+2]<<5 - ary[i+2] 1771 addw(result, result, tmp5); 1772 addw(result, result, tmp6); // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1] 1773 // + 31^^1 * ary[i+2] + 31^^0 * ary[i+3] 1774 addi(ary, ary, elsize * stride); 1775 bne(ary, chunks_end, WIDE_LOOP); 1776 beqz(cnt, DONE); 1777 1778 bind(TAIL); 1779 slli(chunks_end, cnt, chunks_end_shift); 1780 add(chunks_end, ary, chunks_end); 1781 1782 bind(TAIL_LOOP); 1783 arrays_hashcode_elload(t0, Address(ary), eltype); 1784 slli(t1, result, 5); // optimize 31 * result 1785 subw(result, t1, result); // with result<<5 - result 1786 addw(result, result, t0); 1787 addi(ary, ary, elsize); 1788 bne(ary, chunks_end, TAIL_LOOP); 1789 1790 bind(DONE); 1791 BLOCK_COMMENT("} // arrays_hashcode"); 1792 } 1793 1794 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 1795 switch (eltype) { 1796 case T_BOOLEAN: return sizeof(jboolean); 1797 case T_BYTE: return sizeof(jbyte); 1798 case T_SHORT: return sizeof(jshort); 1799 case T_CHAR: return sizeof(jchar); 1800 case T_INT: return sizeof(jint); 1801 default: 1802 ShouldNotReachHere(); 1803 return -1; 1804 } 1805 } 1806 1807 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 1808 switch (eltype) { 1809 // T_BOOLEAN used as surrogate for unsigned byte 1810 
case T_BOOLEAN: lbu(dst, src); break;
1811 case T_BYTE: lb(dst, src); break;
1812 case T_SHORT: lh(dst, src); break;
1813 case T_CHAR: lhu(dst, src); break;
1814 case T_INT: lw(dst, src); break;
1815 default:
1816 ShouldNotReachHere();
1817 }
1818 }
1819
1820 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1821 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1822 bool is_far, bool is_unordered);
1823
1824 static conditional_branch_insn conditional_branches[] =
1825 {
1826 /* SHORT branches */
1827 (conditional_branch_insn)&MacroAssembler::beq,
1828 (conditional_branch_insn)&MacroAssembler::bgt,
1829 nullptr, // BoolTest::overflow
1830 (conditional_branch_insn)&MacroAssembler::blt,
1831 (conditional_branch_insn)&MacroAssembler::bne,
1832 (conditional_branch_insn)&MacroAssembler::ble,
1833 nullptr, // BoolTest::no_overflow
1834 (conditional_branch_insn)&MacroAssembler::bge,
1835
1836 /* UNSIGNED branches */
1837 (conditional_branch_insn)&MacroAssembler::beq,
1838 (conditional_branch_insn)&MacroAssembler::bgtu,
1839 nullptr,
1840 (conditional_branch_insn)&MacroAssembler::bltu,
1841 (conditional_branch_insn)&MacroAssembler::bne,
1842 (conditional_branch_insn)&MacroAssembler::bleu,
1843 nullptr,
1844 (conditional_branch_insn)&MacroAssembler::bgeu
1845 };
1846
1847 static float_conditional_branch_insn float_conditional_branches[] =
1848 {
1849 /* FLOAT SHORT branches */
1850 (float_conditional_branch_insn)&MacroAssembler::float_beq,
1851 (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1852 nullptr, // BoolTest::overflow
1853 (float_conditional_branch_insn)&MacroAssembler::float_blt,
1854 (float_conditional_branch_insn)&MacroAssembler::float_bne,
1855 (float_conditional_branch_insn)&MacroAssembler::float_ble,
1856 nullptr, // BoolTest::no_overflow
1857 (float_conditional_branch_insn)&MacroAssembler::float_bge,
1858
1859 /* DOUBLE SHORT branches */
1860 (float_conditional_branch_insn)&MacroAssembler::double_beq,
1861 (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1862 nullptr,
1863 (float_conditional_branch_insn)&MacroAssembler::double_blt,
1864 (float_conditional_branch_insn)&MacroAssembler::double_bne,
1865 (float_conditional_branch_insn)&MacroAssembler::double_ble,
1866 nullptr,
1867 (float_conditional_branch_insn)&MacroAssembler::double_bge
1868 };
1869
1870 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1871 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1872 "invalid conditional branch index");
1873 (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1874 }
1875
1876 // This function should only be used by C2. For unordered-greater tests the unordered flag is flipped: C2 uses
1877 // unordered-lesser instead of unordered-greater, and finally commutes the result bits in do_one_bytecode().
1878 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1879 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1880 "invalid float conditional branch index");
1881 int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1882 (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1883 (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ?
false : true); 1884 } 1885 1886 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) { 1887 switch (cmpFlag) { 1888 case BoolTest::eq: 1889 case BoolTest::le: 1890 beqz(op1, L, is_far); 1891 break; 1892 case BoolTest::ne: 1893 case BoolTest::gt: 1894 bnez(op1, L, is_far); 1895 break; 1896 default: 1897 ShouldNotReachHere(); 1898 } 1899 } 1900 1901 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) { 1902 switch (cmpFlag) { 1903 case BoolTest::eq: 1904 beqz(op1, L, is_far); 1905 break; 1906 case BoolTest::ne: 1907 bnez(op1, L, is_far); 1908 break; 1909 default: 1910 ShouldNotReachHere(); 1911 } 1912 } 1913 1914 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) { 1915 Label L; 1916 cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L); 1917 mv(dst, src); 1918 bind(L); 1919 } 1920 1921 // Set dst to NaN if any NaN input. 1922 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2, 1923 bool is_double, bool is_min) { 1924 assert_different_registers(dst, src1, src2); 1925 1926 Label Done, Compare; 1927 1928 is_double ? fclass_d(t0, src1) 1929 : fclass_s(t0, src1); 1930 is_double ? fclass_d(t1, src2) 1931 : fclass_s(t1, src2); 1932 orr(t0, t0, t1); 1933 andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN 1934 beqz(t0, Compare); 1935 is_double ? fadd_d(dst, src1, src2) 1936 : fadd_s(dst, src1, src2); 1937 j(Done); 1938 1939 bind(Compare); 1940 if (is_double) { 1941 is_min ? fmin_d(dst, src1, src2) 1942 : fmax_d(dst, src1, src2); 1943 } else { 1944 is_min ? fmin_s(dst, src1, src2) 1945 : fmax_s(dst, src1, src2); 1946 } 1947 1948 bind(Done); 1949 } 1950 1951 // According to Java SE specification, for floating-point round operations, if 1952 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the 1953 // rounded result; this differs from behavior of RISC-V fcvt instructions (which 1954 // round out-of-range values to the nearest max or min value), therefore special 1955 // handling is needed by NaN, +/-Infinity, +/-0. 
1956 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode, 1957 Register tmp1, Register tmp2, Register tmp3) { 1958 1959 assert_different_registers(dst, src); 1960 assert_different_registers(tmp1, tmp2, tmp3); 1961 1962 // Set rounding mode for conversions 1963 // Here we use similar modes to double->long and long->double conversions 1964 // Different mode for long->double conversion matter only if long value was not representable as double, 1965 // we got long value as a result of double->long conversion so, it is definitely representable 1966 RoundingMode rm; 1967 switch (round_mode) { 1968 case RoundDoubleModeNode::rmode_ceil: 1969 rm = RoundingMode::rup; 1970 break; 1971 case RoundDoubleModeNode::rmode_floor: 1972 rm = RoundingMode::rdn; 1973 break; 1974 case RoundDoubleModeNode::rmode_rint: 1975 rm = RoundingMode::rne; 1976 break; 1977 default: 1978 ShouldNotReachHere(); 1979 } 1980 1981 // tmp1 - is a register to store double converted to long int 1982 // tmp2 - is a register to create constant for comparison 1983 // tmp3 - is a register where we store modified result of double->long conversion 1984 Label done, bad_val; 1985 1986 // Conversion from double to long 1987 fcvt_l_d(tmp1, src, rm); 1988 1989 // Generate constant (tmp2) 1990 // tmp2 = 100...0000 1991 addi(tmp2, zr, 1); 1992 slli(tmp2, tmp2, 63); 1993 1994 // Prepare converted long (tmp1) 1995 // as a result when conversion overflow we got: 1996 // tmp1 = 011...1111 or 100...0000 1997 // Convert it to: tmp3 = 100...0000 1998 addi(tmp3, tmp1, 1); 1999 andi(tmp3, tmp3, -2); 2000 beq(tmp3, tmp2, bad_val); 2001 2002 // Conversion from long to double 2003 fcvt_d_l(dst, tmp1, rm); 2004 // Add sign of input value to result for +/- 0 cases 2005 fsgnj_d(dst, dst, src); 2006 j(done); 2007 2008 // If got conversion overflow return src 2009 bind(bad_val); 2010 fmv_d(dst, src); 2011 2012 bind(done); 2013 } 2014 2015 // According to Java SE specification, for floating-point signum operations, if 2016 // on input we have NaN or +/-0.0 value we should return it, 2017 // otherwise return +/- 1.0 using sign of input. 2018 // one - gives us a floating-point 1.0 (got from matching rule) 2019 // bool is_double - specifies single or double precision operations will be used. 2020 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) { 2021 Label done; 2022 2023 is_double ? fclass_d(t0, dst) 2024 : fclass_s(t0, dst); 2025 2026 // check if input is -0, +0, signaling NaN or quiet NaN 2027 andi(t0, t0, fclass_mask::zero | fclass_mask::nan); 2028 2029 bnez(t0, done); 2030 2031 // use floating-point 1.0 with a sign of input 2032 is_double ? fsgnj_d(dst, one, dst) 2033 : fsgnj_s(dst, one, dst); 2034 2035 bind(done); 2036 } 2037 2038 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) { 2039 #define __ masm. 2040 FloatRegister dst = stub.data<0>(); 2041 Register src = stub.data<1>(); 2042 Register tmp = stub.data<2>(); 2043 __ bind(stub.entry()); 2044 2045 // following instructions mainly focus on NaN, as riscv does not handle 2046 // NaN well with fcvt, but the code also works for Inf at the same time. 2047 2048 // construct a NaN in 32 bits from the NaN in 16 bits, 2049 // we need the payloads of non-canonical NaNs to be preserved. 2050 __ mv(tmp, 0x7f800000); 2051 // sign-bit was already set via sign-extension if necessary. 
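// The binary16 fraction field is 10 bits and the binary32 fraction field is 23 bits,
// so shifting left by 13 (= 23 - 10) moves the NaN payload into the top of the
// binary32 fraction field; OR-ing with 0x7f800000 then supplies the all-ones exponent.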
2052 __ slli(t0, src, 13);
2053 __ orr(tmp, t0, tmp);
2054 __ fmv_w_x(dst, tmp);
2055
2056 __ j(stub.continuation());
2057 #undef __
2058 }
2059
2060 // j.l.Float.float16ToFloat
2061 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
2062 auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
2063
2064 // On riscv, NaN needs special handling as fcvt does not work in that case.
2065 // On riscv, Inf does not need special handling as fcvt can handle it correctly,
2066 // but we let the slow path process NaN and Inf at the same time,
2067 // as both of them are rare cases, and making the slow path handle
2068 // only the NaN case would sacrifice performance in the normal,
2069 // i.e. non-NaN and non-Inf, cases.
2070
2071 // check whether it's a NaN or +/- Inf.
2072 mv(t0, 0x7c00);
2073 andr(tmp, src, t0);
2074 // jump to stub processing NaN and Inf cases.
2075 beq(t0, tmp, stub->entry());
2076
2077 // non-NaN or non-Inf cases, just use built-in instructions.
2078 fmv_h_x(dst, src);
2079 fcvt_s_h(dst, dst);
2080
2081 bind(stub->continuation());
2082 }
2083
2084 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
2085 #define __ masm.
2086 Register dst = stub.data<0>();
2087 FloatRegister src = stub.data<1>();
2088 Register tmp = stub.data<2>();
2089 __ bind(stub.entry());
2090
2091 __ fmv_x_w(dst, src);
2092
2093 // preserve the payloads of non-canonical NaNs.
2094 __ srai(dst, dst, 13);
2095 // preserve the sign bit.
2096 __ srai(tmp, dst, 13);
2097 __ slli(tmp, tmp, 10);
2098 __ mv(t0, 0x3ff);
2099 __ orr(tmp, tmp, t0);
2100
2101 // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2102 __ andr(dst, dst, tmp);
2103
2104 __ j(stub.continuation());
2105 #undef __
2106 }
2107
2108 // j.l.Float.floatToFloat16
2109 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
2110 auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);
2111
2112 // On riscv, NaN needs special handling as fcvt does not work in that case.
2113
2114 // check whether it's a NaN.
2115 // replace fclass with feq as performance optimization.
2116 feq_s(t0, src, src);
2117 // jump to stub processing NaN cases.
2118 beqz(t0, stub->entry());
2119
2120 // non-NaN cases, just use built-in instructions.
2121 fcvt_h_s(ftmp, src);
2122 fmv_x_h(dst, ftmp);
2123
2124 bind(stub->continuation());
2125 }
2126
2127 static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
2128 #define __ masm.
2129 VectorRegister dst = stub.data<0>();
2130 VectorRegister src = stub.data<1>();
2131 uint vector_length = stub.data<2>();
2132 __ bind(stub.entry());
2133
2134 // following instructions mainly focus on NaN, as riscv does not handle
2135 // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
2136 //
2137 // construct NaNs in 32 bits from the NaNs in 16 bits,
2138 // we need the payloads of non-canonical NaNs to be preserved.
2139
2140 // adjust vector type to 2 * SEW.
2141 __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
2142 // widen and sign-extend src data.
2143 __ vsext_vf2(dst, src, Assembler::v0_t);
2144 __ mv(t0, 0x7f800000);
2145 // sign-bit was already set via sign-extension if necessary.
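// Same payload widening as the scalar slow path above: shift the 10-bit binary16
// payload into the 23-bit binary32 fraction field (13 = 23 - 10) and OR in the
// all-ones exponent.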
2146 __ vsll_vi(dst, dst, 13, Assembler::v0_t);
2147 __ vor_vx(dst, dst, t0, Assembler::v0_t);
2148
2149 __ j(stub.continuation());
2150 #undef __
2151 }
2152
2153 // j.l.Float.float16ToFloat
2154 void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
2155 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
2156 (dst, src, vector_length, 24, float16_to_float_v_slow_path);
2157 assert_different_registers(dst, src);
2158
2159 // On riscv, NaN needs special handling as vfwcvt_f_f_v does not work in that case.
2160 // On riscv, Inf does not need special handling as vfwcvt_f_f_v can handle it correctly,
2161 // but we let the slow path process NaN and Inf at the same time,
2162 // as both of them are rare cases, and making the slow path handle
2163 // only the NaN case would sacrifice performance in the normal,
2164 // i.e. non-NaN and non-Inf, cases.
2165
2166 vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);
2167
2168 // check whether there is a NaN or +/- Inf.
2169 mv(t0, 0x7c00);
2170 vand_vx(v0, src, t0);
2171 // v0 will be used as mask in slow path.
2172 vmseq_vx(v0, v0, t0);
2173 vcpop_m(t0, v0);
2174
2175 // For non-NaN or non-Inf cases, just use built-in instructions.
2176 vfwcvt_f_f_v(dst, src);
2177
2178 // jump to stub processing NaN and Inf cases if there are any of them in the vector.
2179 bnez(t0, stub->entry());
2180
2181 bind(stub->continuation());
2182 }
2183
2184 static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
2185 C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
2186 #define __ masm.
2187 VectorRegister dst = stub.data<0>();
2188 VectorRegister src = stub.data<1>();
2189 VectorRegister tmp = stub.data<2>();
2190 __ bind(stub.entry());
2191
2192 // lmul is already set to mf2 in float_to_float16_v.
2193
2194 // preserve the payloads of non-canonical NaNs.
2195 __ vnsra_wi(dst, src, 13, Assembler::v0_t);
2196
2197 // preserve the sign bit.
2198 __ vnsra_wi(tmp, src, 26, Assembler::v0_t);
2199 __ vsll_vi(tmp, tmp, 10, Assembler::v0_t);
2200 __ mv(t0, 0x3ff);
2201 __ vor_vx(tmp, tmp, t0, Assembler::v0_t);
2202
2203 // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
2204 __ vand_vv(dst, dst, tmp, Assembler::v0_t);
2205
2206 __ j(stub.continuation());
2207 #undef __
2208 }
2209
2210 // j.l.Float.floatToFloat16
2211 void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src, VectorRegister vtmp,
2212 Register tmp, uint vector_length) {
2213 assert_different_registers(dst, src, vtmp);
2214
2215 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2216 (dst, src, vtmp, 28, float_to_float16_v_slow_path);
2217
2218 // On riscv, NaN needs special handling as vfncvt_f_f_w does not work in that case.
2219
2220 vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);
2221
2222 // check whether there is a NaN.
2223 // replace v_fclass with vmfne_vv as performance optimization.
2224 vmfne_vv(v0, src, src);
2225 vcpop_m(t0, v0);
2226
2227 vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);
2228
2229 // For non-NaN cases, just use built-in instructions.
2230 vfncvt_f_f_w(dst, src);
2231
2232 // jump to stub processing NaN cases.
2233 bnez(t0, stub->entry()); 2234 2235 bind(stub->continuation()); 2236 } 2237 2238 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) { 2239 vsetvli_helper(bt, vlen); 2240 2241 // check if input is -0, +0, signaling NaN or quiet NaN 2242 vfclass_v(v0, dst); 2243 mv(t0, fclass_mask::zero | fclass_mask::nan); 2244 vand_vx(v0, v0, t0); 2245 vmseq_vi(v0, v0, 0); 2246 2247 // use floating-point 1.0 with a sign of input 2248 vfsgnj_vv(dst, one, dst, v0_t); 2249 } 2250 2251 void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) { 2252 Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32; 2253 // intrinsic is enabled when MaxVectorSize >= 16 2254 Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2; 2255 long len = is_long ? 64 : 32; 2256 2257 // load the src data(in bits) to be compressed. 2258 vsetivli(x0, 1, sew, Assembler::m1); 2259 vmv_s_x(v0, src); 2260 // reset the src data(in bytes) to zero. 2261 mv(t0, len); 2262 vsetvli(x0, t0, Assembler::e8, lmul); 2263 vmv_v_i(v4, 0); 2264 // convert the src data from bits to bytes. 2265 vmerge_vim(v4, v4, 1); // v0 as the implicit mask register 2266 // reset the dst data(in bytes) to zero. 2267 vmv_v_i(v8, 0); 2268 // load the mask data(in bits). 2269 vsetivli(x0, 1, sew, Assembler::m1); 2270 vmv_s_x(v0, mask); 2271 // compress the src data(in bytes) to dst(in bytes). 2272 vsetvli(x0, t0, Assembler::e8, lmul); 2273 vcompress_vm(v8, v4, v0); 2274 // convert the dst data from bytes to bits. 2275 vmseq_vi(v0, v8, 1); 2276 // store result back. 2277 vsetivli(x0, 1, sew, Assembler::m1); 2278 vmv_x_s(dst, v0); 2279 } 2280 2281 void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) { 2282 compress_bits_v(dst, src, mask, /* is_long */ false); 2283 } 2284 2285 void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) { 2286 compress_bits_v(dst, src, mask, /* is_long */ true); 2287 } 2288 2289 void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) { 2290 Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32; 2291 // intrinsic is enabled when MaxVectorSize >= 16 2292 Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2; 2293 long len = is_long ? 64 : 32; 2294 2295 // load the src data(in bits) to be expanded. 2296 vsetivli(x0, 1, sew, Assembler::m1); 2297 vmv_s_x(v0, src); 2298 // reset the src data(in bytes) to zero. 2299 mv(t0, len); 2300 vsetvli(x0, t0, Assembler::e8, lmul); 2301 vmv_v_i(v4, 0); 2302 // convert the src data from bits to bytes. 2303 vmerge_vim(v4, v4, 1); // v0 as implicit mask register 2304 // reset the dst data(in bytes) to zero. 2305 vmv_v_i(v12, 0); 2306 // load the mask data(in bits). 2307 vsetivli(x0, 1, sew, Assembler::m1); 2308 vmv_s_x(v0, mask); 2309 // expand the src data(in bytes) to dst(in bytes). 2310 vsetvli(x0, t0, Assembler::e8, lmul); 2311 viota_m(v8, v0); 2312 vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register 2313 // convert the dst data from bytes to bits. 2314 vmseq_vi(v0, v12, 1); 2315 // store result back. 
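// After the vmseq above, mask register v0 holds one result bit per byte lane;
// reading back its first SEW-wide element yields the expanded 32/64-bit value.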
2316 vsetivli(x0, 1, sew, Assembler::m1); 2317 vmv_x_s(dst, v0); 2318 } 2319 2320 void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) { 2321 expand_bits_v(dst, src, mask, /* is_long */ false); 2322 } 2323 2324 void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) { 2325 expand_bits_v(dst, src, mask, /* is_long */ true); 2326 } 2327 2328 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2, 2329 VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) { 2330 Label loop; 2331 Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16; 2332 2333 bind(loop); 2334 vsetvli(tmp1, cnt, sew, Assembler::m2); 2335 vlex_v(vr1, a1, sew); 2336 vlex_v(vr2, a2, sew); 2337 vmsne_vv(vrs, vr1, vr2); 2338 vfirst_m(tmp2, vrs); 2339 bgez(tmp2, DONE); 2340 sub(cnt, cnt, tmp1); 2341 if (!islatin) { 2342 slli(tmp1, tmp1, 1); // get byte counts 2343 } 2344 add(a1, a1, tmp1); 2345 add(a2, a2, tmp1); 2346 bnez(cnt, loop); 2347 2348 mv(result, true); 2349 } 2350 2351 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) { 2352 Label DONE; 2353 Register tmp1 = t0; 2354 Register tmp2 = t1; 2355 2356 BLOCK_COMMENT("string_equals_v {"); 2357 2358 mv(result, false); 2359 2360 element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE); 2361 2362 bind(DONE); 2363 BLOCK_COMMENT("} string_equals_v"); 2364 } 2365 2366 // used by C2 ClearArray patterns. 2367 // base: Address of a buffer to be zeroed 2368 // cnt: Count in HeapWords 2369 // 2370 // base, cnt, v4, v5, v6, v7 and t0 are clobbered. 2371 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) { 2372 Label loop; 2373 2374 // making zero words 2375 vsetvli(t0, cnt, Assembler::e64, Assembler::m4); 2376 vxor_vv(v4, v4, v4); 2377 2378 bind(loop); 2379 vsetvli(t0, cnt, Assembler::e64, Assembler::m4); 2380 vse64_v(v4, base); 2381 sub(cnt, cnt, t0); 2382 shadd(base, t0, base, t0, 3); 2383 bnez(cnt, loop); 2384 } 2385 2386 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result, 2387 Register cnt1, int elem_size) { 2388 Label DONE; 2389 Register tmp1 = t0; 2390 Register tmp2 = t1; 2391 Register cnt2 = tmp2; 2392 int length_offset = arrayOopDesc::length_offset_in_bytes(); 2393 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? 
T_CHAR : T_BYTE); 2394 2395 BLOCK_COMMENT("arrays_equals_v {"); 2396 2397 // if (a1 == a2), return true 2398 mv(result, true); 2399 beq(a1, a2, DONE); 2400 2401 mv(result, false); 2402 // if a1 == null or a2 == null, return false 2403 beqz(a1, DONE); 2404 beqz(a2, DONE); 2405 // if (a1.length != a2.length), return false 2406 lwu(cnt1, Address(a1, length_offset)); 2407 lwu(cnt2, Address(a2, length_offset)); 2408 bne(cnt1, cnt2, DONE); 2409 2410 la(a1, Address(a1, base_offset)); 2411 la(a2, Address(a2, base_offset)); 2412 2413 element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE); 2414 2415 bind(DONE); 2416 2417 BLOCK_COMMENT("} arrays_equals_v"); 2418 } 2419 2420 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2, 2421 Register result, Register tmp1, Register tmp2, int encForm) { 2422 Label DIFFERENCE, DONE, L, loop; 2423 bool encLL = encForm == StrIntrinsicNode::LL; 2424 bool encLU = encForm == StrIntrinsicNode::LU; 2425 bool encUL = encForm == StrIntrinsicNode::UL; 2426 2427 bool str1_isL = encLL || encLU; 2428 bool str2_isL = encLL || encUL; 2429 2430 int minCharsInWord = encLL ? wordSize : wordSize / 2; 2431 2432 BLOCK_COMMENT("string_compare {"); 2433 2434 // for Latin strings, 1 byte for 1 character 2435 // for UTF16 strings, 2 bytes for 1 character 2436 if (!str1_isL) 2437 sraiw(cnt1, cnt1, 1); 2438 if (!str2_isL) 2439 sraiw(cnt2, cnt2, 1); 2440 2441 // if str1 == str2, return the difference 2442 // save the minimum of the string lengths in cnt2. 2443 sub(result, cnt1, cnt2); 2444 bgt(cnt1, cnt2, L); 2445 mv(cnt2, cnt1); 2446 bind(L); 2447 2448 if (str1_isL == str2_isL) { // LL or UU 2449 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE); 2450 j(DONE); 2451 } else { // LU or UL 2452 Register strL = encLU ? str1 : str2; 2453 Register strU = encLU ? str2 : str1; 2454 VectorRegister vstr1 = encLU ? v8 : v4; 2455 VectorRegister vstr2 = encLU ? v4 : v8; 2456 2457 bind(loop); 2458 vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2); 2459 vle8_v(vstr1, strL); 2460 vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4); 2461 vzext_vf2(vstr2, vstr1); 2462 vle16_v(vstr1, strU); 2463 vmsne_vv(v4, vstr2, vstr1); 2464 vfirst_m(tmp2, v4); 2465 bgez(tmp2, DIFFERENCE); 2466 sub(cnt2, cnt2, tmp1); 2467 add(strL, strL, tmp1); 2468 shadd(strU, tmp1, strU, tmp1, 1); 2469 bnez(cnt2, loop); 2470 j(DONE); 2471 } 2472 2473 bind(DIFFERENCE); 2474 slli(tmp1, tmp2, 1); 2475 add(str1, str1, str1_isL ? tmp2 : tmp1); 2476 add(str2, str2, str2_isL ? tmp2 : tmp1); 2477 str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0)); 2478 str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0)); 2479 sub(result, tmp1, tmp2); 2480 2481 bind(DONE); 2482 } 2483 2484 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) { 2485 Label loop; 2486 assert_different_registers(src, dst, len, tmp, t0); 2487 2488 BLOCK_COMMENT("byte_array_inflate_v {"); 2489 bind(loop); 2490 vsetvli(tmp, len, Assembler::e8, Assembler::m2); 2491 vle8_v(v6, src); 2492 vsetvli(t0, len, Assembler::e16, Assembler::m4); 2493 vzext_vf2(v4, v6); 2494 vse16_v(v4, dst); 2495 sub(len, len, tmp); 2496 add(src, src, tmp); 2497 shadd(dst, tmp, dst, tmp, 1); 2498 bnez(len, loop); 2499 BLOCK_COMMENT("} byte_array_inflate_v"); 2500 } 2501 2502 // Compress char[] array to byte[]. 
2503 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) 2504 // result: the array length if every element in array can be encoded, 2505 // otherwise, the index of first non-latin1 (> 0xff) character. 2506 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len, 2507 Register result, Register tmp) { 2508 encode_iso_array_v(src, dst, len, result, tmp, false); 2509 } 2510 2511 // Intrinsic for 2512 // 2513 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray 2514 // return the number of characters copied. 2515 // - java/lang/StringUTF16.compress 2516 // return index of non-latin1 character if copy fails, otherwise 'len'. 2517 // 2518 // This version always returns the number of characters copied. A successful 2519 // copy will complete with the post-condition: 'res' == 'len', while an 2520 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'. 2521 // 2522 // Clobbers: src, dst, len, result, t0 2523 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len, 2524 Register result, Register tmp, bool ascii) { 2525 Label loop, fail, done; 2526 2527 BLOCK_COMMENT("encode_iso_array_v {"); 2528 mv(result, 0); 2529 2530 bind(loop); 2531 mv(tmp, ascii ? 0x7f : 0xff); 2532 vsetvli(t0, len, Assembler::e16, Assembler::m2); 2533 vle16_v(v2, src); 2534 2535 vmsgtu_vx(v1, v2, tmp); 2536 vfirst_m(tmp, v1); 2537 vmsbf_m(v0, v1); 2538 // compress char to byte 2539 vsetvli(t0, len, Assembler::e8); 2540 vncvt_x_x_w(v1, v2, Assembler::v0_t); 2541 vse8_v(v1, dst, Assembler::v0_t); 2542 2543 // fail if char > 0x7f/0xff 2544 bgez(tmp, fail); 2545 add(result, result, t0); 2546 add(dst, dst, t0); 2547 sub(len, len, t0); 2548 shadd(src, t0, src, t0, 1); 2549 bnez(len, loop); 2550 j(done); 2551 2552 bind(fail); 2553 add(result, result, tmp); 2554 2555 bind(done); 2556 BLOCK_COMMENT("} encode_iso_array_v"); 2557 } 2558 2559 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) { 2560 Label LOOP, SET_RESULT, DONE; 2561 2562 BLOCK_COMMENT("count_positives_v {"); 2563 assert_different_registers(ary, len, result, tmp); 2564 2565 mv(result, zr); 2566 2567 bind(LOOP); 2568 vsetvli(t0, len, Assembler::e8, Assembler::m4); 2569 vle8_v(v4, ary); 2570 vmslt_vx(v4, v4, zr); 2571 vfirst_m(tmp, v4); 2572 bgez(tmp, SET_RESULT); 2573 // if tmp == -1, all bytes are positive 2574 add(result, result, t0); 2575 2576 sub(len, len, t0); 2577 add(ary, ary, t0); 2578 bnez(len, LOOP); 2579 j(DONE); 2580 2581 // add remaining positive bytes count 2582 bind(SET_RESULT); 2583 add(result, result, tmp); 2584 2585 bind(DONE); 2586 BLOCK_COMMENT("} count_positives_v"); 2587 } 2588 2589 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1, 2590 Register ch, Register result, 2591 Register tmp1, Register tmp2, 2592 bool isL) { 2593 mv(result, zr); 2594 2595 Label loop, MATCH, DONE; 2596 Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16; 2597 bind(loop); 2598 vsetvli(tmp1, cnt1, sew, Assembler::m4); 2599 vlex_v(v4, str1, sew); 2600 vmseq_vx(v4, v4, ch); 2601 vfirst_m(tmp2, v4); 2602 bgez(tmp2, MATCH); // if equal, return index 2603 2604 add(result, result, tmp1); 2605 sub(cnt1, cnt1, tmp1); 2606 if (!isL) slli(tmp1, tmp1, 1); 2607 add(str1, str1, tmp1); 2608 bnez(cnt1, loop); 2609 2610 mv(result, -1); 2611 j(DONE); 2612 2613 bind(MATCH); 2614 add(result, result, tmp2); 2615 2616 bind(DONE); 2617 } 2618 2619 // Set dst to NaN if any NaN input. 
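// RVV vfmin/vfmax return the non-NaN operand when only one input is NaN, whereas
// Java Math.min/max must return NaN whenever either input is NaN. The masked
// vfadd below therefore rewrites every lane whose input is NaN with src + src,
// which yields a NaN. Per lane, roughly:
//
//   dst[i] = is_min ? min(src1[i], src2[i]) : max(src1[i], src2[i]);
//   if (isNaN(src1[i])) dst[i] = src1[i] + src1[i];
//   if (isNaN(src2[i])) dst[i] = src2[i] + src2[i];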
2620 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, 2621 BasicType bt, bool is_min, uint vector_length) { 2622 assert_different_registers(dst, src1, src2); 2623 2624 vsetvli_helper(bt, vector_length); 2625 2626 is_min ? vfmin_vv(dst, src1, src2) 2627 : vfmax_vv(dst, src1, src2); 2628 2629 vmfne_vv(v0, src1, src1); 2630 vfadd_vv(dst, src1, src1, Assembler::v0_t); 2631 vmfne_vv(v0, src2, src2); 2632 vfadd_vv(dst, src2, src2, Assembler::v0_t); 2633 } 2634 2635 // Set dst to NaN if any NaN input. 2636 // The destination vector register elements corresponding to masked-off elements 2637 // are handled with a mask-undisturbed policy. 2638 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, 2639 VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2, 2640 BasicType bt, bool is_min, uint vector_length) { 2641 assert_different_registers(src1, src2, tmp1, tmp2); 2642 vsetvli_helper(bt, vector_length); 2643 2644 // Check vector elements of src1 and src2 for NaN. 2645 vmfeq_vv(tmp1, src1, src1); 2646 vmfeq_vv(tmp2, src2, src2); 2647 2648 vmandn_mm(v0, vmask, tmp1); 2649 vfadd_vv(dst, src1, src1, Assembler::v0_t); 2650 vmandn_mm(v0, vmask, tmp2); 2651 vfadd_vv(dst, src2, src2, Assembler::v0_t); 2652 2653 vmand_mm(tmp2, tmp1, tmp2); 2654 vmand_mm(v0, vmask, tmp2); 2655 is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t) 2656 : vfmax_vv(dst, src1, src2, Assembler::v0_t); 2657 } 2658 2659 // Set dst to NaN if any NaN input. 2660 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst, 2661 FloatRegister src1, VectorRegister src2, 2662 VectorRegister tmp1, VectorRegister tmp2, 2663 bool is_double, bool is_min, uint vector_length, VectorMask vm) { 2664 assert_different_registers(dst, src1); 2665 assert_different_registers(src2, tmp1, tmp2); 2666 2667 Label L_done, L_NaN_1, L_NaN_2; 2668 // Set dst to src1 if src1 is NaN 2669 is_double ? feq_d(t0, src1, src1) 2670 : feq_s(t0, src1, src1); 2671 beqz(t0, L_NaN_2); 2672 2673 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length); 2674 vfmv_s_f(tmp2, src1); 2675 2676 is_min ? vfredmin_vs(tmp1, src2, tmp2, vm) 2677 : vfredmax_vs(tmp1, src2, tmp2, vm); 2678 vfmv_f_s(dst, tmp1); 2679 2680 // Checking NaNs in src2 2681 vmfne_vv(tmp1, src2, src2, vm); 2682 vcpop_m(t0, tmp1, vm); 2683 beqz(t0, L_done); 2684 2685 bind(L_NaN_1); 2686 vfredusum_vs(tmp1, src2, tmp2, vm); 2687 vfmv_f_s(dst, tmp1); 2688 j(L_done); 2689 2690 bind(L_NaN_2); 2691 is_double ? 
fmv_d(dst, src1) 2692 : fmv_s(dst, src1); 2693 bind(L_done); 2694 } 2695 2696 bool C2_MacroAssembler::in_scratch_emit_size() { 2697 if (ciEnv::current()->task() != nullptr) { 2698 PhaseOutput* phase_output = Compile::current()->output(); 2699 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2700 return true; 2701 } 2702 } 2703 return MacroAssembler::in_scratch_emit_size(); 2704 } 2705 2706 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1, 2707 VectorRegister src2, VectorRegister tmp, 2708 int opc, BasicType bt, uint vector_length, VectorMask vm) { 2709 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2710 vsetvli_helper(bt, vector_length); 2711 vmv_s_x(tmp, src1); 2712 switch (opc) { 2713 case Op_AddReductionVI: 2714 case Op_AddReductionVL: 2715 vredsum_vs(tmp, src2, tmp, vm); 2716 break; 2717 case Op_AndReductionV: 2718 vredand_vs(tmp, src2, tmp, vm); 2719 break; 2720 case Op_OrReductionV: 2721 vredor_vs(tmp, src2, tmp, vm); 2722 break; 2723 case Op_XorReductionV: 2724 vredxor_vs(tmp, src2, tmp, vm); 2725 break; 2726 case Op_MaxReductionV: 2727 vredmax_vs(tmp, src2, tmp, vm); 2728 break; 2729 case Op_MinReductionV: 2730 vredmin_vs(tmp, src2, tmp, vm); 2731 break; 2732 default: 2733 ShouldNotReachHere(); 2734 } 2735 vmv_x_s(dst, tmp); 2736 } 2737 2738 // Set vl and vtype for full and partial vector operations. 2739 // (vma = mu, vta = tu, vill = false) 2740 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) { 2741 Assembler::SEW sew = Assembler::elemtype_to_sew(bt); 2742 if (vector_length <= 31) { 2743 vsetivli(tmp, vector_length, sew, vlmul); 2744 } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) { 2745 vsetvli(tmp, x0, sew, vlmul); 2746 } else { 2747 mv(tmp, vector_length); 2748 vsetvli(tmp, tmp, sew, vlmul); 2749 } 2750 } 2751 2752 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2, 2753 int cond, BasicType bt, uint vector_length, VectorMask vm) { 2754 assert(is_integral_type(bt), "unsupported element type"); 2755 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers"); 2756 vsetvli_helper(bt, vector_length); 2757 vmclr_m(vd); 2758 switch (cond) { 2759 case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break; 2760 case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break; 2761 case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break; 2762 case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break; 2763 case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break; 2764 case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break; 2765 case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break; 2766 case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break; 2767 case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break; 2768 case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break; 2769 default: 2770 assert(false, "unsupported compare condition"); 2771 ShouldNotReachHere(); 2772 } 2773 } 2774 2775 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2, 2776 int cond, BasicType bt, uint vector_length, VectorMask vm) { 2777 assert(is_floating_point_type(bt), "unsupported element type"); 2778 assert(vm == Assembler::v0_t ? 
vd != v0 : true, "should be different registers"); 2779 vsetvli_helper(bt, vector_length); 2780 vmclr_m(vd); 2781 switch (cond) { 2782 case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break; 2783 case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break; 2784 case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break; 2785 case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break; 2786 case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break; 2787 case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break; 2788 default: 2789 assert(false, "unsupported compare condition"); 2790 ShouldNotReachHere(); 2791 } 2792 } 2793 2794 // In Matcher::scalable_predicate_reg_slots, 2795 // we assume each predicate register is one-eighth of the size of 2796 // scalable vector register, one mask bit per vector byte. 2797 void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) { 2798 vsetvli_helper(T_BYTE, MaxVectorSize >> 3); 2799 add(t0, sp, offset); 2800 vse8_v(v, t0); 2801 } 2802 2803 void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) { 2804 vsetvli_helper(T_BYTE, MaxVectorSize >> 3); 2805 add(t0, sp, offset); 2806 vle8_v(v, t0); 2807 } 2808 2809 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length, 2810 VectorRegister src, BasicType src_bt, bool is_signed) { 2811 assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size"); 2812 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type"); 2813 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands 2814 // The destination EEW is greater than the source EEW, the source EMUL is at least 1, 2815 // and the overlap is in the highest-numbered part of the destination register group. 2816 // Since LMUL=1, vd and vs cannot be the same. 2817 assert_different_registers(dst, src); 2818 2819 vsetvli_helper(dst_bt, vector_length); 2820 if (is_signed) { 2821 if (src_bt == T_BYTE) { 2822 switch (dst_bt) { 2823 case T_SHORT: 2824 vsext_vf2(dst, src); 2825 break; 2826 case T_INT: 2827 vsext_vf4(dst, src); 2828 break; 2829 case T_LONG: 2830 vsext_vf8(dst, src); 2831 break; 2832 default: 2833 ShouldNotReachHere(); 2834 } 2835 } else if (src_bt == T_SHORT) { 2836 if (dst_bt == T_INT) { 2837 vsext_vf2(dst, src); 2838 } else { 2839 vsext_vf4(dst, src); 2840 } 2841 } else if (src_bt == T_INT) { 2842 vsext_vf2(dst, src); 2843 } 2844 } else { 2845 if (src_bt == T_BYTE) { 2846 switch (dst_bt) { 2847 case T_SHORT: 2848 vzext_vf2(dst, src); 2849 break; 2850 case T_INT: 2851 vzext_vf4(dst, src); 2852 break; 2853 case T_LONG: 2854 vzext_vf8(dst, src); 2855 break; 2856 default: 2857 ShouldNotReachHere(); 2858 } 2859 } else if (src_bt == T_SHORT) { 2860 if (dst_bt == T_INT) { 2861 vzext_vf2(dst, src); 2862 } else { 2863 vzext_vf4(dst, src); 2864 } 2865 } else if (src_bt == T_INT) { 2866 vzext_vf2(dst, src); 2867 } 2868 } 2869 } 2870 2871 // Vector narrow from src to dst with specified element sizes. 2872 // High part of dst vector will be filled with zero. 
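// RVV narrowing conversions halve the element width one step at a time, so wider
// narrowings are chained, e.g. T_LONG -> T_BYTE is performed as e64 -> e32 -> e16 -> e8
// with one vncvt_x_x_w per step, truncating the upper bits at each step.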
2873 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
2874 VectorRegister src, BasicType src_bt) {
2875 assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
2876 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2877 mv(t0, vector_length);
2878 if (src_bt == T_LONG) {
2879 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
2880 // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
2881 // So we can currently only scale down by 1/2 the width at a time.
2882 vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
2883 vncvt_x_x_w(dst, src);
2884 if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
2885 vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2886 vncvt_x_x_w(dst, dst);
2887 if (dst_bt == T_BYTE) {
2888 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2889 vncvt_x_x_w(dst, dst);
2890 }
2891 }
2892 } else if (src_bt == T_INT) {
2893 // T_SHORT
2894 vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2895 vncvt_x_x_w(dst, src);
2896 if (dst_bt == T_BYTE) {
2897 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2898 vncvt_x_x_w(dst, dst);
2899 }
2900 } else if (src_bt == T_SHORT) {
2901 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2902 vncvt_x_x_w(dst, src);
2903 }
2904 }
2905
2906 #define VFCVT_SAFE(VFLOATCVT) \
2907 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
2908 assert_different_registers(dst, src); \
2909 vxor_vv(dst, dst, dst); \
2910 vmfeq_vv(v0, src, src); \
2911 VFLOATCVT(dst, src, Assembler::v0_t); \
2912 }
2913
2914 VFCVT_SAFE(vfcvt_rtz_x_f_v);
2915
2916 #undef VFCVT_SAFE
2917
2918 // Extract a scalar element from a vector at position 'idx'.
2919 // The input elements in src are expected to be of integral type.
2920 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
2921 int idx, VectorRegister tmp) {
2922 assert(is_integral_type(bt), "unsupported element type");
2923 assert(idx >= 0, "idx cannot be negative");
2924 // Only need the first element after vector slidedown
2925 vsetvli_helper(bt, 1);
2926 if (idx == 0) {
2927 vmv_x_s(dst, src);
2928 } else if (idx <= 31) {
2929 vslidedown_vi(tmp, src, idx);
2930 vmv_x_s(dst, tmp);
2931 } else {
2932 mv(t0, idx);
2933 vslidedown_vx(tmp, src, t0);
2934 vmv_x_s(dst, tmp);
2935 }
2936 }
2937
2938 // Extract a scalar element from a vector at position 'idx'.
2939 // The input elements in src are expected to be of floating point type.
2940 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
2941 int idx, VectorRegister tmp) {
2942 assert(is_floating_point_type(bt), "unsupported element type");
2943 assert(idx >= 0, "idx cannot be negative");
2944 // Only need the first element after vector slidedown
2945 vsetvli_helper(bt, 1);
2946 if (idx == 0) {
2947 vfmv_f_s(dst, src);
2948 } else if (idx <= 31) {
2949 vslidedown_vi(tmp, src, idx);
2950 vfmv_f_s(dst, tmp);
2951 } else {
2952 mv(t0, idx);
2953 vslidedown_vx(tmp, src, t0);
2954 vfmv_f_s(dst, tmp);
2955 }
2956 }
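// Both extract helpers above amount to the scalar assignment dst = src[idx]:
// the vector group is slid down by 'idx' positions and element 0 is then read out,
// using the immediate form of vslidedown when idx fits in 5 bits (idx <= 31) and
// the register form otherwise.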