1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/assembler.hpp" 28 #include "asm/assembler.inline.hpp" 29 #include "opto/c2_MacroAssembler.hpp" 30 #include "opto/compile.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 #include "utilities/globalDefinitions.hpp" 36 37 #ifdef PRODUCT 38 #define BLOCK_COMMENT(str) /* nothing */ 39 #define STOP(error) stop(error) 40 #else 41 #define BLOCK_COMMENT(str) block_comment(str) 42 #define STOP(error) block_comment(error); stop(error) 43 #endif 44 45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 46 47 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, 48 Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) { 49 // Use cr register to indicate the fast_lock result: zero for success; non-zero for failure. 50 Register flag = t1; 51 Register oop = objectReg; 52 Register box = boxReg; 53 Register disp_hdr = tmp1Reg; 54 Register tmp = tmp2Reg; 55 Label object_has_monitor; 56 // Finish fast lock successfully. MUST branch to with flag == 0 57 Label locked; 58 // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0 59 Label slow_path; 60 61 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 62 assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0); 63 64 mv(flag, 1); 65 66 // Load markWord from object into displaced_header. 
67 ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 68 69 if (DiagnoseSyncOnValueBasedClasses != 0) { 70 load_klass(tmp, oop); 71 lwu(tmp, Address(tmp, Klass::access_flags_offset())); 72 test_bit(tmp, tmp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 73 bnez(tmp, slow_path); 74 } 75 76 // Check for existing monitor 77 test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value)); 78 bnez(tmp, object_has_monitor); 79 80 if (LockingMode == LM_MONITOR) { 81 j(slow_path); 82 } else { 83 assert(LockingMode == LM_LEGACY, "must be"); 84 // Set tmp to be (markWord of object | UNLOCK_VALUE). 85 ori(tmp, disp_hdr, markWord::unlocked_value); 86 87 // Initialize the box. (Must happen before we update the object mark!) 88 sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 89 90 // Compare object markWord with an unlocked value (tmp) and if 91 // equal exchange the stack address of our box with object markWord. 92 // On failure disp_hdr contains the possibly locked markWord. 93 cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64, 94 Assembler::aq, Assembler::rl, /*result*/disp_hdr); 95 beq(disp_hdr, tmp, locked); 96 97 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 98 99 // If the compare-and-exchange succeeded, then we found an unlocked 100 // object, will have now locked it will continue at label locked 101 // We did not see an unlocked object so try the fast recursive case. 102 103 // Check if the owner is self by comparing the value in the 104 // markWord of object (disp_hdr) with the stack pointer. 105 sub(disp_hdr, disp_hdr, sp); 106 mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place)); 107 // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto label locked, 108 // hence we can store 0 as the displaced header in the box, which indicates that it is a 109 // recursive lock. 
110 andr(tmp/*==0?*/, disp_hdr, tmp); 111 sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes())); 112 beqz(tmp, locked); 113 j(slow_path); 114 } 115 116 // Handle existing monitor. 117 bind(object_has_monitor); 118 // The object's monitor m is unlocked iff m->owner == nullptr, 119 // otherwise m->owner may contain a thread or a stack address. 120 // 121 // Try to CAS m->owner from null to current thread id. 122 Register tid = flag; 123 mv(tid, Address(xthread, JavaThread::lock_id_offset())); 124 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value)); 125 cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/tid, Assembler::int64, 126 Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected) 127 128 // Store a non-null value into the box to avoid looking like a re-entrant 129 // lock. The fast-path monitor unlock code checks for 130 // markWord::monitor_value so use markWord::unused_mark which has the 131 // relevant bit set, and also matches ObjectSynchronizer::slow_enter. 132 mv(tmp, (address)markWord::unused_mark().value()); 133 sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 134 135 beqz(tmp3Reg, locked); // CAS success means locking succeeded 136 137 bne(tmp3Reg, tid, slow_path); // Check for recursive locking 138 139 // Recursive lock case 140 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg); 141 142 bind(locked); 143 mv(flag, zr); 144 increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2Reg, tmp3Reg); 145 146 #ifdef ASSERT 147 // Check that locked label is reached with flag == 0. 148 Label flag_correct; 149 beqz(flag, flag_correct); 150 stop("Fast Lock Flag != 0"); 151 #endif 152 153 bind(slow_path); 154 #ifdef ASSERT 155 // Check that slow_path label is reached with flag != 0. 
156 bnez(flag, flag_correct); 157 stop("Fast Lock Flag == 0"); 158 bind(flag_correct); 159 #endif 160 // C2 uses the value of flag (0 vs !0) to determine the continuation. 161 } 162 163 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, 164 Register tmp1Reg, Register tmp2Reg) { 165 // Use cr register to indicate the fast_unlock result: zero for success; non-zero for failure. 166 Register flag = t1; 167 Register oop = objectReg; 168 Register box = boxReg; 169 Register disp_hdr = tmp1Reg; 170 Register tmp = tmp2Reg; 171 Label object_has_monitor; 172 // Finish fast lock successfully. MUST branch to vwith flag == 0 173 Label unlocked; 174 // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0 175 Label slow_path; 176 177 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 178 assert_different_registers(oop, box, tmp, disp_hdr, flag, t0); 179 180 mv(flag, 1); 181 182 if (LockingMode == LM_LEGACY) { 183 // Find the lock address and load the displaced header from the stack. 184 ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes())); 185 186 // If the displaced header is 0, we have a recursive unlock. 187 beqz(disp_hdr, unlocked); 188 } 189 190 // Handle existing monitor. 191 ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes())); 192 test_bit(t0, tmp, exact_log2(markWord::monitor_value)); 193 bnez(t0, object_has_monitor); 194 195 if (LockingMode == LM_MONITOR) { 196 j(slow_path); 197 } else { 198 assert(LockingMode == LM_LEGACY, "must be"); 199 // Check if it is still a light weight lock, this is true if we 200 // see the stack address of the basicLock in the markWord of the 201 // object. 
202 203 cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64, 204 Assembler::relaxed, Assembler::rl, /*result*/tmp); 205 beq(box, tmp, unlocked); // box == tmp if cas succeeds 206 j(slow_path); 207 } 208 209 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 210 211 // Handle existing monitor. 212 bind(object_has_monitor); 213 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 214 add(tmp, tmp, -(int)markWord::monitor_value); // monitor 215 216 ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 217 218 Label notRecursive; 219 beqz(disp_hdr, notRecursive); // Will be 0 if not recursive. 220 221 // Recursive lock 222 addi(disp_hdr, disp_hdr, -1); 223 sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 224 j(unlocked); 225 226 bind(notRecursive); 227 ld(t0, Address(tmp, ObjectMonitor::EntryList_offset())); 228 ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset())); 229 orr(t0, t0, disp_hdr); // Will be 0 if both are 0. 230 bnez(t0, slow_path); 231 232 // need a release store here 233 la(tmp, Address(tmp, ObjectMonitor::owner_offset())); 234 membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore); 235 sd(zr, Address(tmp)); // set unowned 236 237 bind(unlocked); 238 mv(flag, zr); 239 decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp1Reg, tmp2Reg); 240 241 #ifdef ASSERT 242 // Check that unlocked label is reached with flag == 0. 243 Label flag_correct; 244 beqz(flag, flag_correct); 245 stop("Fast Lock Flag != 0"); 246 #endif 247 248 bind(slow_path); 249 #ifdef ASSERT 250 // Check that slow_path label is reached with flag != 0. 251 bnez(flag, flag_correct); 252 stop("Fast Lock Flag == 0"); 253 bind(flag_correct); 254 #endif 255 // C2 uses the value of flag (0 vs !0) to determine the continuation. 
256 } 257 258 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register tmp1, Register tmp2, Register tmp3) { 259 // Flag register, zero for success; non-zero for failure. 260 Register flag = t1; 261 262 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 263 assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0); 264 265 mv(flag, 1); 266 267 // Handle inflated monitor. 268 Label inflated; 269 // Finish fast lock successfully. MUST branch to with flag == 0 270 Label locked; 271 // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0 272 Label slow_path; 273 274 if (DiagnoseSyncOnValueBasedClasses != 0) { 275 load_klass(tmp1, obj); 276 lwu(tmp1, Address(tmp1, Klass::access_flags_offset())); 277 test_bit(tmp1, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 278 bnez(tmp1, slow_path); 279 } 280 281 const Register tmp1_mark = tmp1; 282 283 { // Lightweight locking 284 285 // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0 286 Label push; 287 288 const Register tmp2_top = tmp2; 289 const Register tmp3_t = tmp3; 290 291 // Check if lock-stack is full. 292 lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); 293 mv(tmp3_t, (unsigned)LockStack::end_offset()); 294 bge(tmp2_top, tmp3_t, slow_path); 295 296 // Check if recursive. 297 add(tmp3_t, xthread, tmp2_top); 298 ld(tmp3_t, Address(tmp3_t, -oopSize)); 299 beq(obj, tmp3_t, push); 300 301 // Relaxed normal load to check for monitor. Optimization for monitor case. 302 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 303 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value)); 304 bnez(tmp3_t, inflated); 305 306 // Not inflated 307 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la"); 308 309 // Try to lock. 
Transition lock-bits 0b01 => 0b00 310 ori(tmp1_mark, tmp1_mark, markWord::unlocked_value); 311 xori(tmp3_t, tmp1_mark, markWord::unlocked_value); 312 cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64, 313 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t); 314 bne(tmp1_mark, tmp3_t, slow_path); 315 316 bind(push); 317 // After successful lock, push object on lock-stack. 318 add(tmp3_t, xthread, tmp2_top); 319 sd(obj, Address(tmp3_t)); 320 addw(tmp2_top, tmp2_top, oopSize); 321 sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); 322 j(locked); 323 } 324 325 { // Handle inflated monitor. 326 bind(inflated); 327 328 // mark contains the tagged ObjectMonitor*. 329 const Register tmp1_tagged_monitor = tmp1_mark; 330 const uintptr_t monitor_tag = markWord::monitor_value; 331 const Register tmp2_owner_addr = tmp2; 332 const Register tmp3_owner = tmp3; 333 334 // Compute owner address. 335 la(tmp2_owner_addr, Address(tmp1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag))); 336 337 // CAS owner (null => current thread id). 338 Register tid = flag; 339 mv(tid, Address(xthread, JavaThread::lock_id_offset())); 340 cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ tid, Assembler::int64, 341 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner); 342 beqz(tmp3_owner, locked); 343 344 // Check if recursive. 345 bne(tmp3_owner, tid, slow_path); 346 347 // Recursive. 348 increment(Address(tmp1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1, tmp2, tmp3); 349 } 350 351 bind(locked); 352 mv(flag, zr); 353 increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3); 354 355 #ifdef ASSERT 356 // Check that locked label is reached with flag == 0. 
357 Label flag_correct; 358 beqz(flag, flag_correct); 359 stop("Fast Lock Flag != 0"); 360 #endif 361 362 bind(slow_path); 363 #ifdef ASSERT 364 // Check that slow_path label is reached with flag != 0. 365 bnez(flag, flag_correct); 366 stop("Fast Lock Flag == 0"); 367 bind(flag_correct); 368 #endif 369 // C2 uses the value of flag (0 vs !0) to determine the continuation. 370 } 371 372 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register tmp1, Register tmp2, 373 Register tmp3) { 374 // Flag register, zero for success; non-zero for failure. 375 Register flag = t1; 376 377 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 378 assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0); 379 380 mv(flag, 1); 381 382 // Handle inflated monitor. 383 Label inflated, inflated_load_monitor; 384 // Finish fast unlock successfully. unlocked MUST branch to with flag == 0 385 Label unlocked; 386 // Finish fast unlock unsuccessfully. MUST branch to with flag != 0 387 Label slow_path; 388 389 const Register tmp1_mark = tmp1; 390 const Register tmp2_top = tmp2; 391 const Register tmp3_t = tmp3; 392 393 { // Lightweight unlock 394 395 // Check if obj is top of lock-stack. 396 lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); 397 subw(tmp2_top, tmp2_top, oopSize); 398 add(tmp3_t, xthread, tmp2_top); 399 ld(tmp3_t, Address(tmp3_t)); 400 // Top of lock stack was not obj. Must be monitor. 401 bne(obj, tmp3_t, inflated_load_monitor); 402 403 // Pop lock-stack. 404 DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);) 405 DEBUG_ONLY(sd(zr, Address(tmp3_t));) 406 sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); 407 408 // Check if recursive. 409 add(tmp3_t, xthread, tmp2_top); 410 ld(tmp3_t, Address(tmp3_t, -oopSize)); 411 beq(obj, tmp3_t, unlocked); 412 413 // Not recursive. 414 // Load Mark. 415 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 416 417 // Check header for monitor (0b10). 
418 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value)); 419 bnez(tmp3_t, inflated); 420 421 // Try to unlock. Transition lock bits 0b00 => 0b01 422 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 423 ori(tmp3_t, tmp1_mark, markWord::unlocked_value); 424 cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64, 425 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t); 426 beq(tmp1_mark, tmp3_t, unlocked); 427 428 // Compare and exchange failed. 429 // Restore lock-stack and handle the unlock in runtime. 430 DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);) 431 DEBUG_ONLY(sd(obj, Address(tmp3_t));) 432 addw(tmp2_top, tmp2_top, oopSize); 433 sd(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset())); 434 j(slow_path); 435 } 436 437 { // Handle inflated monitor. 438 bind(inflated_load_monitor); 439 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 440 #ifdef ASSERT 441 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value)); 442 bnez(tmp3_t, inflated); 443 stop("Fast Unlock not monitor"); 444 #endif 445 446 bind(inflated); 447 448 #ifdef ASSERT 449 Label check_done; 450 subw(tmp2_top, tmp2_top, oopSize); 451 mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset())); 452 blt(tmp2_top, tmp3_t, check_done); 453 add(tmp3_t, xthread, tmp2_top); 454 ld(tmp3_t, Address(tmp3_t)); 455 bne(obj, tmp3_t, inflated); 456 stop("Fast Unlock lock on stack"); 457 bind(check_done); 458 #endif 459 460 // mark contains the tagged ObjectMonitor*. 461 const Register tmp1_monitor = tmp1_mark; 462 const uintptr_t monitor_tag = markWord::monitor_value; 463 464 // Untag the monitor. 465 sub(tmp1_monitor, tmp1_mark, monitor_tag); 466 467 const Register tmp2_recursions = tmp2; 468 Label not_recursive; 469 470 // Check if recursive. 471 ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset())); 472 beqz(tmp2_recursions, not_recursive); 473 474 // Recursive unlock. 
475 addi(tmp2_recursions, tmp2_recursions, -1); 476 sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset())); 477 j(unlocked); 478 479 bind(not_recursive); 480 481 Label release; 482 const Register tmp2_owner_addr = tmp2; 483 484 // Compute owner address. 485 la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset())); 486 487 // Check if the entry lists are empty. 488 ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset())); 489 ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset())); 490 orr(t0, t0, tmp3_t); 491 beqz(t0, release); 492 493 // The owner may be anonymous and we removed the last obj entry in 494 // the lock-stack. This loses the information about the owner. 495 // Write the thread to the owner field so the runtime knows the owner. 496 Register tid = flag; 497 mv(tid, Address(xthread, JavaThread::lock_id_offset())); 498 sd(tid, Address(tmp2_owner_addr)); 499 j(slow_path); 500 501 bind(release); 502 // Set owner to null. 503 membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore); 504 sd(zr, Address(tmp2_owner_addr)); 505 } 506 507 bind(unlocked); 508 mv(flag, zr); 509 decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3); 510 511 #ifdef ASSERT 512 // Check that unlocked label is reached with flag == 0. 513 Label flag_correct; 514 beqz(flag, flag_correct); 515 stop("Fast Lock Flag != 0"); 516 #endif 517 518 bind(slow_path); 519 #ifdef ASSERT 520 // Check that slow_path label is reached with flag != 0. 521 bnez(flag, flag_correct); 522 stop("Fast Lock Flag == 0"); 523 bind(flag_correct); 524 #endif 525 // C2 uses the value of flag (0 vs !0) to determine the continuation. 
526 } 527 528 // short string 529 // StringUTF16.indexOfChar 530 // StringLatin1.indexOfChar 531 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1, 532 Register ch, Register result, 533 bool isL) 534 { 535 Register ch1 = t0; 536 Register index = t1; 537 538 BLOCK_COMMENT("string_indexof_char_short {"); 539 540 Label LOOP, LOOP1, LOOP4, LOOP8; 541 Label MATCH, MATCH1, MATCH2, MATCH3, 542 MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH; 543 544 mv(result, -1); 545 mv(index, zr); 546 547 bind(LOOP); 548 addi(t0, index, 8); 549 ble(t0, cnt1, LOOP8); 550 addi(t0, index, 4); 551 ble(t0, cnt1, LOOP4); 552 j(LOOP1); 553 554 bind(LOOP8); 555 isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0)); 556 beq(ch, ch1, MATCH); 557 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2)); 558 beq(ch, ch1, MATCH1); 559 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4)); 560 beq(ch, ch1, MATCH2); 561 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6)); 562 beq(ch, ch1, MATCH3); 563 isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8)); 564 beq(ch, ch1, MATCH4); 565 isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10)); 566 beq(ch, ch1, MATCH5); 567 isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12)); 568 beq(ch, ch1, MATCH6); 569 isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14)); 570 beq(ch, ch1, MATCH7); 571 addi(index, index, 8); 572 addi(str1, str1, isL ? 8 : 16); 573 blt(index, cnt1, LOOP); 574 j(NOMATCH); 575 576 bind(LOOP4); 577 isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0)); 578 beq(ch, ch1, MATCH); 579 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2)); 580 beq(ch, ch1, MATCH1); 581 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4)); 582 beq(ch, ch1, MATCH2); 583 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6)); 584 beq(ch, ch1, MATCH3); 585 addi(index, index, 4); 586 addi(str1, str1, isL ? 
4 : 8); 587 bge(index, cnt1, NOMATCH); 588 589 bind(LOOP1); 590 isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1)); 591 beq(ch, ch1, MATCH); 592 addi(index, index, 1); 593 addi(str1, str1, isL ? 1 : 2); 594 blt(index, cnt1, LOOP1); 595 j(NOMATCH); 596 597 bind(MATCH1); 598 addi(index, index, 1); 599 j(MATCH); 600 601 bind(MATCH2); 602 addi(index, index, 2); 603 j(MATCH); 604 605 bind(MATCH3); 606 addi(index, index, 3); 607 j(MATCH); 608 609 bind(MATCH4); 610 addi(index, index, 4); 611 j(MATCH); 612 613 bind(MATCH5); 614 addi(index, index, 5); 615 j(MATCH); 616 617 bind(MATCH6); 618 addi(index, index, 6); 619 j(MATCH); 620 621 bind(MATCH7); 622 addi(index, index, 7); 623 624 bind(MATCH); 625 mv(result, index); 626 bind(NOMATCH); 627 BLOCK_COMMENT("} string_indexof_char_short"); 628 } 629 630 // StringUTF16.indexOfChar 631 // StringLatin1.indexOfChar 632 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 633 Register ch, Register result, 634 Register tmp1, Register tmp2, 635 Register tmp3, Register tmp4, 636 bool isL) 637 { 638 Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG; 639 Register ch1 = t0; 640 Register orig_cnt = t1; 641 Register mask1 = tmp3; 642 Register mask2 = tmp2; 643 Register match_mask = tmp1; 644 Register trailing_char = tmp4; 645 Register unaligned_elems = tmp4; 646 647 BLOCK_COMMENT("string_indexof_char {"); 648 beqz(cnt1, NOMATCH); 649 650 addi(t0, cnt1, isL ? 
-32 : -16); 651 bgtz(t0, DO_LONG); 652 string_indexof_char_short(str1, cnt1, ch, result, isL); 653 j(DONE); 654 655 bind(DO_LONG); 656 mv(orig_cnt, cnt1); 657 if (AvoidUnalignedAccesses) { 658 Label ALIGNED; 659 andi(unaligned_elems, str1, 0x7); 660 beqz(unaligned_elems, ALIGNED); 661 sub(unaligned_elems, unaligned_elems, 8); 662 neg(unaligned_elems, unaligned_elems); 663 if (!isL) { 664 srli(unaligned_elems, unaligned_elems, 1); 665 } 666 // do unaligned part per element 667 string_indexof_char_short(str1, unaligned_elems, ch, result, isL); 668 bgez(result, DONE); 669 mv(orig_cnt, cnt1); 670 sub(cnt1, cnt1, unaligned_elems); 671 bind(ALIGNED); 672 } 673 674 // duplicate ch 675 if (isL) { 676 slli(ch1, ch, 8); 677 orr(ch, ch1, ch); 678 } 679 slli(ch1, ch, 16); 680 orr(ch, ch1, ch); 681 slli(ch1, ch, 32); 682 orr(ch, ch1, ch); 683 684 if (!isL) { 685 slli(cnt1, cnt1, 1); 686 } 687 688 uint64_t mask0101 = UCONST64(0x0101010101010101); 689 uint64_t mask0001 = UCONST64(0x0001000100010001); 690 mv(mask1, isL ? mask0101 : mask0001); 691 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 692 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 693 mv(mask2, isL ? 
mask7f7f : mask7fff); 694 695 bind(CH1_LOOP); 696 ld(ch1, Address(str1)); 697 addi(str1, str1, 8); 698 addi(cnt1, cnt1, -8); 699 compute_match_mask(ch1, ch, match_mask, mask1, mask2); 700 bnez(match_mask, HIT); 701 bgtz(cnt1, CH1_LOOP); 702 j(NOMATCH); 703 704 bind(HIT); 705 ctzc_bit(trailing_char, match_mask, isL, ch1, result); 706 srli(trailing_char, trailing_char, 3); 707 addi(cnt1, cnt1, 8); 708 ble(cnt1, trailing_char, NOMATCH); 709 // match case 710 if (!isL) { 711 srli(cnt1, cnt1, 1); 712 srli(trailing_char, trailing_char, 1); 713 } 714 715 sub(result, orig_cnt, cnt1); 716 add(result, result, trailing_char); 717 j(DONE); 718 719 bind(NOMATCH); 720 mv(result, -1); 721 722 bind(DONE); 723 BLOCK_COMMENT("} string_indexof_char"); 724 } 725 726 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp); 727 728 // Search for needle in haystack and return index or -1 729 // x10: result 730 // x11: haystack 731 // x12: haystack_len 732 // x13: needle 733 // x14: needle_len 734 void C2_MacroAssembler::string_indexof(Register haystack, Register needle, 735 Register haystack_len, Register needle_len, 736 Register tmp1, Register tmp2, 737 Register tmp3, Register tmp4, 738 Register tmp5, Register tmp6, 739 Register result, int ae) 740 { 741 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 742 743 Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH; 744 745 Register ch1 = t0; 746 Register ch2 = t1; 747 Register nlen_tmp = tmp1; // needle len tmp 748 Register hlen_tmp = tmp2; // haystack len tmp 749 Register result_tmp = tmp4; 750 751 bool isLL = ae == StrIntrinsicNode::LL; 752 753 bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 754 bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 755 int needle_chr_shift = needle_isL ? 0 : 1; 756 int haystack_chr_shift = haystack_isL ? 0 : 1; 757 int needle_chr_size = needle_isL ? 1 : 2; 758 int haystack_chr_size = haystack_isL ? 
1 : 2; 759 load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu : 760 (load_chr_insn)&MacroAssembler::lhu; 761 load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu : 762 (load_chr_insn)&MacroAssembler::lhu; 763 764 BLOCK_COMMENT("string_indexof {"); 765 766 // Note, inline_string_indexOf() generates checks: 767 // if (pattern.count > src.count) return -1; 768 // if (pattern.count == 0) return 0; 769 770 // We have two strings, a source string in haystack, haystack_len and a pattern string 771 // in needle, needle_len. Find the first occurrence of pattern in source or return -1. 772 773 // For larger pattern and source we use a simplified Boyer Moore algorithm. 774 // With a small pattern and source we use linear scan. 775 776 // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm. 777 sub(result_tmp, haystack_len, needle_len); 778 // needle_len < 8, use linear scan 779 sub(t0, needle_len, 8); 780 bltz(t0, LINEARSEARCH); 781 // needle_len >= 256, use linear scan 782 sub(t0, needle_len, 256); 783 bgez(t0, LINEARSTUB); 784 // needle_len >= haystack_len/4, use linear scan 785 srli(t0, haystack_len, 2); 786 bge(needle_len, t0, LINEARSTUB); 787 788 // Boyer-Moore-Horspool introduction: 789 // The Boyer Moore alogorithm is based on the description here:- 790 // 791 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 792 // 793 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 794 // and the 'Good Suffix' rule. 795 // 796 // These rules are essentially heuristics for how far we can shift the 797 // pattern along the search string. 798 // 799 // The implementation here uses the 'Bad Character' rule only because of the 800 // complexity of initialisation for the 'Good Suffix' rule. 
801 // 802 // This is also known as the Boyer-Moore-Horspool algorithm: 803 // 804 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 805 // 806 // #define ASIZE 256 807 // 808 // int bm(unsigned char *pattern, int m, unsigned char *src, int n) { 809 // int i, j; 810 // unsigned c; 811 // unsigned char bc[ASIZE]; 812 // 813 // /* Preprocessing */ 814 // for (i = 0; i < ASIZE; ++i) 815 // bc[i] = m; 816 // for (i = 0; i < m - 1; ) { 817 // c = pattern[i]; 818 // ++i; 819 // // c < 256 for Latin1 string, so, no need for branch 820 // #ifdef PATTERN_STRING_IS_LATIN1 821 // bc[c] = m - i; 822 // #else 823 // if (c < ASIZE) bc[c] = m - i; 824 // #endif 825 // } 826 // 827 // /* Searching */ 828 // j = 0; 829 // while (j <= n - m) { 830 // c = src[i+j]; 831 // if (pattern[m-1] == c) 832 // int k; 833 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k); 834 // if (k < 0) return j; 835 // // c < 256 for Latin1 string, so, no need for branch 836 // #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1 837 // // LL case: (c< 256) always true. Remove branch 838 // j += bc[pattern[j+m-1]]; 839 // #endif 840 // #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF 841 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 842 // if (c < ASIZE) 843 // j += bc[pattern[j+m-1]]; 844 // else 845 // j += 1 846 // #endif 847 // #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1 848 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 
849 // if (c < ASIZE) 850 // j += bc[pattern[j+m-1]]; 851 // else 852 // j += m 853 // #endif 854 // } 855 // return -1; 856 // } 857 858 // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result 859 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 860 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 861 862 Register haystack_end = haystack_len; 863 Register skipch = tmp2; 864 865 // pattern length is >=8, so, we can read at least 1 register for cases when 866 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 867 // UL case. We'll re-read last character in inner pre-loop code to have 868 // single outer pre-loop load 869 const int firstStep = isLL ? 7 : 3; 870 871 const int ASIZE = 256; 872 const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd) 873 874 sub(sp, sp, ASIZE); 875 876 // init BC offset table with default value: needle_len 877 slli(t0, needle_len, 8); 878 orr(t0, t0, needle_len); // [63...16][needle_len][needle_len] 879 slli(tmp1, t0, 16); 880 orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len] 881 slli(tmp1, t0, 32); 882 orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len] 883 884 mv(ch1, sp); // ch1 is t0 885 mv(tmp6, ASIZE / STORE_BYTES); // loop iterations 886 887 bind(BM_INIT_LOOP); 888 // for (i = 0; i < ASIZE; ++i) 889 // bc[i] = m; 890 for (int i = 0; i < 4; i++) { 891 sd(tmp5, Address(ch1, i * wordSize)); 892 } 893 add(ch1, ch1, 32); 894 sub(tmp6, tmp6, 4); 895 bgtz(tmp6, BM_INIT_LOOP); 896 897 sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern 898 Register orig_haystack = tmp5; 899 mv(orig_haystack, haystack); 900 // result_tmp = tmp4 901 shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift); 902 sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1 903 mv(tmp3, needle); 904 905 // for (i = 0; i < m - 1; ) { 906 // c = pattern[i]; 907 // ++i; 908 // // c < 256 for Latin1 string, 
so, no need for branch 909 // #ifdef PATTERN_STRING_IS_LATIN1 910 // bc[c] = m - i; 911 // #else 912 // if (c < ASIZE) bc[c] = m - i; 913 // #endif 914 // } 915 bind(BCLOOP); 916 (this->*needle_load_1chr)(ch1, Address(tmp3), noreg); 917 add(tmp3, tmp3, needle_chr_size); 918 if (!needle_isL) { 919 // ae == StrIntrinsicNode::UU 920 mv(tmp6, ASIZE); 921 bgeu(ch1, tmp6, BCSKIP); 922 } 923 add(tmp4, sp, ch1); 924 sb(ch2, Address(tmp4)); // store skip offset to BC offset table 925 926 bind(BCSKIP); 927 sub(ch2, ch2, 1); // for next pattern element, skip distance -1 928 bgtz(ch2, BCLOOP); 929 930 // tmp6: pattern end, address after needle 931 shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift); 932 if (needle_isL == haystack_isL) { 933 // load last 8 bytes (8LL/4UU symbols) 934 ld(tmp6, Address(tmp6, -wordSize)); 935 } else { 936 // UL: from UTF-16(source) search Latin1(pattern) 937 lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols) 938 // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d 939 // We'll have to wait until load completed, but it's still faster than per-character loads+checks 940 srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a 941 slli(ch2, tmp6, XLEN - 24); 942 srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b 943 slli(ch1, tmp6, XLEN - 16); 944 srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c 945 andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d 946 slli(ch2, ch2, 16); 947 orr(ch2, ch2, ch1); // 0x00000b0c 948 slli(result, tmp3, 48); // use result as temp register 949 orr(tmp6, tmp6, result); // 0x0a00000d 950 slli(result, ch2, 16); 951 orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d 952 } 953 954 // i = m - 1; 955 // skipch = j + i; 956 // if (skipch == pattern[m - 1] 957 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k); 958 // else 959 // move j with bad char offset table 960 bind(BMLOOPSTR2); 961 // compare pattern to source string backward 962 shadd(result, 
nlen_tmp, haystack, result, haystack_chr_shift); 963 (this->*haystack_load_1chr)(skipch, Address(result), noreg); 964 sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8 965 if (needle_isL == haystack_isL) { 966 // re-init tmp3. It's for free because it's executed in parallel with 967 // load above. Alternative is to initialize it before loop, but it'll 968 // affect performance on in-order systems with 2 or more ld/st pipelines 969 srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1] 970 } 971 if (!isLL) { // UU/UL case 972 slli(ch2, nlen_tmp, 1); // offsets in bytes 973 } 974 bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char 975 add(result, haystack, isLL ? nlen_tmp : ch2); 976 // load 8 bytes from source string 977 // if isLL is false then read granularity can be 2 978 load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway 979 mv(ch1, tmp6); 980 if (isLL) { 981 j(BMLOOPSTR1_AFTER_LOAD); 982 } else { 983 sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 984 j(BMLOOPSTR1_CMP); 985 } 986 987 bind(BMLOOPSTR1); 988 shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift); 989 (this->*needle_load_1chr)(ch1, Address(ch1), noreg); 990 shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift); 991 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 992 993 bind(BMLOOPSTR1_AFTER_LOAD); 994 sub(nlen_tmp, nlen_tmp, 1); 995 bltz(nlen_tmp, BMLOOPSTR1_LASTCMP); 996 997 bind(BMLOOPSTR1_CMP); 998 beq(ch1, ch2, BMLOOPSTR1); 999 1000 bind(BMSKIP); 1001 if (!isLL) { 1002 // if we've met UTF symbol while searching Latin1 pattern, then we can 1003 // skip needle_len symbols 1004 if (needle_isL != haystack_isL) { 1005 mv(result_tmp, needle_len); 1006 } else { 1007 mv(result_tmp, 1); 1008 } 1009 mv(t0, ASIZE); 1010 bgeu(skipch, t0, BMADV); 1011 } 1012 add(result_tmp, sp, skipch); 1013 lbu(result_tmp, Address(result_tmp)); // load skip offset 1014 1015 bind(BMADV); 1016 sub(nlen_tmp, needle_len, 1); 1017 // move haystack after bad char skip offset 1018 shadd(haystack, result_tmp, haystack, result, haystack_chr_shift); 1019 ble(haystack, haystack_end, BMLOOPSTR2); 1020 add(sp, sp, ASIZE); 1021 j(NOMATCH); 1022 1023 bind(BMLOOPSTR1_LASTCMP); 1024 bne(ch1, ch2, BMSKIP); 1025 1026 bind(BMMATCH); 1027 sub(result, haystack, orig_haystack); 1028 if (!haystack_isL) { 1029 srli(result, result, 1); 1030 } 1031 add(sp, sp, ASIZE); 1032 j(DONE); 1033 1034 bind(LINEARSTUB); 1035 sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm 1036 bltz(t0, LINEARSEARCH); 1037 mv(result, zr); 1038 RuntimeAddress stub = nullptr; 1039 if (isLL) { 1040 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll()); 1041 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 1042 } else if (needle_isL) { 1043 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul()); 1044 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 1045 } else 
{
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
    assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
  }
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(LINEARSEARCH);
  string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof");
}

// string_indexof
// result: x10
// src: x11
// src_count: x12
// pattern: x13
// pattern_count: x14 or 1/2/3/4
//
// Emits a linear-scan search for 'needle' inside 'haystack'. On exit 'result'
// holds the character index of the first match, or -1 when there is none.
//
// needle_con_cnt selects the shape of the generated code:
//   -1     the needle length is only known at run time (needle_len register);
//   1..4   the needle length is this compile-time constant.
// ae is a StrIntrinsicNode encoding pair; LU is rejected by the assert below.
//
// Every variant uses the same addressing scheme: 'haystack' is advanced by
// result_tmp characters and the haystack_len register is reused as a negative
// byte offset (hlen_neg) that is stepped up towards zero, so a single register
// serves as both loop counter and index.
//
// haystack, haystack_len, tmp3, tmp4, t0 and t1 are written; the run-time-length
// path additionally writes needle, needle_len, tmp1 and tmp2.
// NOTE(review): with needle_con_cnt == -1 every path except DO1 reads tmp4
// (result_tmp) before this function writes it, so the caller presumably
// pre-loads it with haystack_len - needle_len -- confirm at the call sites.
void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
                                                  Register haystack_len, Register needle_len,
                                                  Register tmp1, Register tmp2,
                                                  Register tmp3, Register tmp4,
                                                  int needle_con_cnt, Register result, int ae)
{
  // Note:
  // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
  // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
  assert(needle_con_cnt <= 4, "Invalid needle constant count");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // t0/t1 hold the characters (or character groups) currently being compared.
  Register ch1 = t0;
  Register ch2 = t1;
  // The length registers are recycled as negative byte offsets.
  Register hlen_neg = haystack_len, nlen_neg = needle_len;
  Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;

  // Loads for one character, and for two / four characters at once
  // (the latter two are chosen by isLL only).
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                              (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
  load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;

  Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;

  Register first = tmp3;

  if (needle_con_cnt == -1) {
    Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

    // Needles shorter than 4 (same encoding) or 2 (mixed encoding) characters
    // are handled by the short paths further down.
    sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
    bltz(t0, DOSHORT);

    (this->*needle_load_1chr)(first, Address(needle), noreg);
    // needle -> just past its last character; nlen_neg = -(needle length in bytes)
    slli(t0, needle_len, needle_chr_shift);
    add(needle, needle, t0);
    neg(nlen_neg, t0);
    // haystack += result_tmp characters; hlen_neg = -(that distance in bytes)
    slli(t0, result_tmp, haystack_chr_shift);
    add(haystack, haystack, t0);
    neg(hlen_neg, t0);

    // Look for the first needle character.
    bind(FIRST_LOOP);
    add(t0, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
    beq(first, ch2, STR1_LOOP);

    // Step one haystack character; keep scanning while the offset is <= 0.
    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    // First character matched: compare the rest of the needle, starting at
    // the second character of both strings.
    bind(STR1_LOOP);
    add(nlen_tmp, nlen_neg, needle_chr_size);
    add(hlen_tmp, hlen_neg, haystack_chr_size);
    bgez(nlen_tmp, MATCH);

    bind(STR1_NEXT);
    add(ch1, needle, nlen_tmp);
    (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    add(nlen_tmp, nlen_tmp, needle_chr_size);
    add(hlen_tmp, hlen_tmp, haystack_chr_size);
    bltz(nlen_tmp, STR1_NEXT);
    j(MATCH);

    bind(DOSHORT);
    if (needle_isL == haystack_isL) {
      // shorter than 2 -> DO1, longer than 2 -> DO3, exactly 2 falls through to DO2
      sub(t0, needle_len, 2);
      bltz(t0, DO1);
      bgtz(t0, DO3);
    }
  }

  // Constant needle of 4 characters: compare all four with one wide value.
  if (needle_con_cnt == 4) {
    Label CH1_LOOP;
    (this->*load_4chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 4);
    slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload first value, then we will read by 1 character per loop, instead of four
      // just shifting previous ch2 right by size of character in bits
      add(tmp3, haystack, hlen_neg);
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
      if (isLL) {
        // need to erase 1 most significant byte in 32-bit value of ch2
        slli(ch2, ch2, 40);
        srli(ch2, ch2, 32);
      } else {
        slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
      }
    }

    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      // Drop the oldest character and insert the newly loaded fourth one on top.
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
      slli(tmp3, tmp3, isLL ? 24 : 48);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
  }

  // Needle of 2 characters (constant, or run-time length with equal encodings).
  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
    Label CH1_LOOP;
    BLOCK_COMMENT("string_indexof DO2 {");
    bind(DO2);
    (this->*load_2chr)(ch1, Address(needle), noreg);
    if (needle_con_cnt == 2) {
      sub(result_tmp, haystack_len, 2);
    }
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload first value, then we will read by 1 character per loop, instead of two
      // just shifting previous ch2 right by size of character in bits
      add(tmp3, haystack, hlen_neg);
      (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
      slli(ch2, ch2, isLL ? 8 : 16);
    }
    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      // Drop the older character and insert the newly loaded second one on top.
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
      slli(tmp3, tmp3, isLL ? 8 : 16);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_2chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
    BLOCK_COMMENT("} string_indexof DO2");
  }

  // Needle of 3 characters: match the first two as a pair ('first'), then the
  // third one (ch1) separately.
  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
    Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
    BLOCK_COMMENT("string_indexof DO3 {");

    bind(DO3);
    (this->*load_2chr)(first, Address(needle), noreg);
    (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
    if (needle_con_cnt == 3) {
      sub(result_tmp, haystack_len, 3);
    }
    slli(hlen_tmp, result_tmp, haystack_chr_shift);
    add(haystack, haystack, hlen_tmp);
    neg(hlen_neg, hlen_tmp);

    bind(FIRST_LOOP);
    add(ch2, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
      (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
      slli(tmp2, tmp2, isLL ? 8 : 16);
      add(ch2, ch2, tmp2);
    } else {
      (this->*load_2chr)(ch2, Address(ch2), noreg);
    }
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    // First two characters matched: check the third.
    bind(STR1_LOOP);
    add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    j(MATCH);
    BLOCK_COMMENT("} string_indexof DO3");
  }

  // Needle of a single character; falls through into NOMATCH when the scan
  // runs off the end.
  if (needle_con_cnt == -1 || needle_con_cnt == 1) {
    Label DO1_LOOP;

    BLOCK_COMMENT("string_indexof DO1 {");
    bind(DO1);
    (this->*needle_load_1chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 1);
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);

    bind(DO1_LOOP);
    add(tmp3, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, DO1_LOOP);
    BLOCK_COMMENT("} string_indexof DO1");
  }

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  // result = result_tmp + (hlen_neg converted from bytes to characters);
  // hlen_neg is <= 0 here.
  bind(MATCH);
  srai(t0, hlen_neg, haystack_chr_shift);
  add(result, result_tmp, t0);

  bind(DONE);
}

// Compare strings.
1303 void C2_MacroAssembler::string_compare(Register str1, Register str2, 1304 Register cnt1, Register cnt2, Register result, 1305 Register tmp1, Register tmp2, Register tmp3, 1306 int ae) 1307 { 1308 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 1309 DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 1310 SHORT_LOOP_START, TAIL_CHECK, L; 1311 1312 const int STUB_THRESHOLD = 64 + 8; 1313 bool isLL = ae == StrIntrinsicNode::LL; 1314 bool isLU = ae == StrIntrinsicNode::LU; 1315 bool isUL = ae == StrIntrinsicNode::UL; 1316 1317 bool str1_isL = isLL || isLU; 1318 bool str2_isL = isLL || isUL; 1319 1320 // for L strings, 1 byte for 1 character 1321 // for U strings, 2 bytes for 1 character 1322 int str1_chr_size = str1_isL ? 1 : 2; 1323 int str2_chr_size = str2_isL ? 1 : 2; 1324 int minCharsInWord = isLL ? wordSize : wordSize / 2; 1325 1326 load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu; 1327 load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu; 1328 1329 BLOCK_COMMENT("string_compare {"); 1330 1331 // Bizzarely, the counts are passed in bytes, regardless of whether they 1332 // are L or U strings, however the result is always in characters. 1333 if (!str1_isL) { 1334 sraiw(cnt1, cnt1, 1); 1335 } 1336 if (!str2_isL) { 1337 sraiw(cnt2, cnt2, 1); 1338 } 1339 1340 // Compute the minimum of the string lengths and save the difference in result. 
1341 sub(result, cnt1, cnt2); 1342 bgt(cnt1, cnt2, L); 1343 mv(cnt2, cnt1); 1344 bind(L); 1345 1346 // A very short string 1347 mv(t0, minCharsInWord); 1348 ble(cnt2, t0, SHORT_STRING); 1349 1350 // Compare longwords 1351 // load first parts of strings and finish initialization while loading 1352 { 1353 if (str1_isL == str2_isL) { // LL or UU 1354 // check if str1 and str2 is same pointer 1355 beq(str1, str2, DONE); 1356 // load 8 bytes once to compare 1357 ld(tmp1, Address(str1)); 1358 ld(tmp2, Address(str2)); 1359 mv(t0, STUB_THRESHOLD); 1360 bge(cnt2, t0, STUB); 1361 sub(cnt2, cnt2, minCharsInWord); 1362 beqz(cnt2, TAIL_CHECK); 1363 // convert cnt2 from characters to bytes 1364 if (!str1_isL) { 1365 slli(cnt2, cnt2, 1); 1366 } 1367 add(str2, str2, cnt2); 1368 add(str1, str1, cnt2); 1369 sub(cnt2, zr, cnt2); 1370 } else if (isLU) { // LU case 1371 lwu(tmp1, Address(str1)); 1372 ld(tmp2, Address(str2)); 1373 mv(t0, STUB_THRESHOLD); 1374 bge(cnt2, t0, STUB); 1375 addi(cnt2, cnt2, -4); 1376 add(str1, str1, cnt2); 1377 sub(cnt1, zr, cnt2); 1378 slli(cnt2, cnt2, 1); 1379 add(str2, str2, cnt2); 1380 inflate_lo32(tmp3, tmp1); 1381 mv(tmp1, tmp3); 1382 sub(cnt2, zr, cnt2); 1383 addi(cnt1, cnt1, 4); 1384 } else { // UL case 1385 ld(tmp1, Address(str1)); 1386 lwu(tmp2, Address(str2)); 1387 mv(t0, STUB_THRESHOLD); 1388 bge(cnt2, t0, STUB); 1389 addi(cnt2, cnt2, -4); 1390 slli(t0, cnt2, 1); 1391 sub(cnt1, zr, t0); 1392 add(str1, str1, t0); 1393 add(str2, str2, cnt2); 1394 inflate_lo32(tmp3, tmp2); 1395 mv(tmp2, tmp3); 1396 sub(cnt2, zr, cnt2); 1397 addi(cnt1, cnt1, 8); 1398 } 1399 addi(cnt2, cnt2, isUL ? 
4 : 8); 1400 bne(tmp1, tmp2, DIFFERENCE); 1401 bgez(cnt2, TAIL); 1402 1403 // main loop 1404 bind(NEXT_WORD); 1405 if (str1_isL == str2_isL) { // LL or UU 1406 add(t0, str1, cnt2); 1407 ld(tmp1, Address(t0)); 1408 add(t0, str2, cnt2); 1409 ld(tmp2, Address(t0)); 1410 addi(cnt2, cnt2, 8); 1411 } else if (isLU) { // LU case 1412 add(t0, str1, cnt1); 1413 lwu(tmp1, Address(t0)); 1414 add(t0, str2, cnt2); 1415 ld(tmp2, Address(t0)); 1416 addi(cnt1, cnt1, 4); 1417 inflate_lo32(tmp3, tmp1); 1418 mv(tmp1, tmp3); 1419 addi(cnt2, cnt2, 8); 1420 } else { // UL case 1421 add(t0, str2, cnt2); 1422 lwu(tmp2, Address(t0)); 1423 add(t0, str1, cnt1); 1424 ld(tmp1, Address(t0)); 1425 inflate_lo32(tmp3, tmp2); 1426 mv(tmp2, tmp3); 1427 addi(cnt1, cnt1, 8); 1428 addi(cnt2, cnt2, 4); 1429 } 1430 bne(tmp1, tmp2, DIFFERENCE); 1431 bltz(cnt2, NEXT_WORD); 1432 bind(TAIL); 1433 if (str1_isL == str2_isL) { // LL or UU 1434 load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2); 1435 load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2); 1436 } else if (isLU) { // LU case 1437 load_int_misaligned(tmp1, Address(str1), tmp3, false); 1438 load_long_misaligned(tmp2, Address(str2), tmp3, 2); 1439 inflate_lo32(tmp3, tmp1); 1440 mv(tmp1, tmp3); 1441 } else { // UL case 1442 load_int_misaligned(tmp2, Address(str2), tmp3, false); 1443 load_long_misaligned(tmp1, Address(str1), tmp3, 2); 1444 inflate_lo32(tmp3, tmp2); 1445 mv(tmp2, tmp3); 1446 } 1447 bind(TAIL_CHECK); 1448 beq(tmp1, tmp2, DONE); 1449 1450 // Find the first different characters in the longwords and 1451 // compute their difference. 
1452 bind(DIFFERENCE); 1453 xorr(tmp3, tmp1, tmp2); 1454 ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb 1455 srl(tmp1, tmp1, result); 1456 srl(tmp2, tmp2, result); 1457 if (isLL) { 1458 andi(tmp1, tmp1, 0xFF); 1459 andi(tmp2, tmp2, 0xFF); 1460 } else { 1461 andi(tmp1, tmp1, 0xFFFF); 1462 andi(tmp2, tmp2, 0xFFFF); 1463 } 1464 sub(result, tmp1, tmp2); 1465 j(DONE); 1466 } 1467 1468 bind(STUB); 1469 RuntimeAddress stub = nullptr; 1470 switch (ae) { 1471 case StrIntrinsicNode::LL: 1472 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL()); 1473 break; 1474 case StrIntrinsicNode::UU: 1475 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU()); 1476 break; 1477 case StrIntrinsicNode::LU: 1478 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU()); 1479 break; 1480 case StrIntrinsicNode::UL: 1481 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL()); 1482 break; 1483 default: 1484 ShouldNotReachHere(); 1485 } 1486 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1487 address call = trampoline_call(stub); 1488 if (call == nullptr) { 1489 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1490 ciEnv::current()->record_failure("CodeCache is full"); 1491 return; 1492 } 1493 j(DONE); 1494 1495 bind(SHORT_STRING); 1496 // Is the minimum length zero? 
1497 beqz(cnt2, DONE); 1498 // arrange code to do most branches while loading and loading next characters 1499 // while comparing previous 1500 (this->*str1_load_chr)(tmp1, Address(str1), t0); 1501 addi(str1, str1, str1_chr_size); 1502 addi(cnt2, cnt2, -1); 1503 beqz(cnt2, SHORT_LAST_INIT); 1504 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1505 addi(str2, str2, str2_chr_size); 1506 j(SHORT_LOOP_START); 1507 bind(SHORT_LOOP); 1508 addi(cnt2, cnt2, -1); 1509 beqz(cnt2, SHORT_LAST); 1510 bind(SHORT_LOOP_START); 1511 (this->*str1_load_chr)(tmp2, Address(str1), t0); 1512 addi(str1, str1, str1_chr_size); 1513 (this->*str2_load_chr)(t0, Address(str2), t0); 1514 addi(str2, str2, str2_chr_size); 1515 bne(tmp1, cnt1, SHORT_LOOP_TAIL); 1516 addi(cnt2, cnt2, -1); 1517 beqz(cnt2, SHORT_LAST2); 1518 (this->*str1_load_chr)(tmp1, Address(str1), t0); 1519 addi(str1, str1, str1_chr_size); 1520 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1521 addi(str2, str2, str2_chr_size); 1522 beq(tmp2, t0, SHORT_LOOP); 1523 sub(result, tmp2, t0); 1524 j(DONE); 1525 bind(SHORT_LOOP_TAIL); 1526 sub(result, tmp1, cnt1); 1527 j(DONE); 1528 bind(SHORT_LAST2); 1529 beq(tmp2, t0, DONE); 1530 sub(result, tmp2, t0); 1531 1532 j(DONE); 1533 bind(SHORT_LAST_INIT); 1534 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1535 addi(str2, str2, str2_chr_size); 1536 bind(SHORT_LAST); 1537 beq(tmp1, cnt1, DONE); 1538 sub(result, tmp1, cnt1); 1539 1540 bind(DONE); 1541 1542 BLOCK_COMMENT("} string_compare"); 1543 } 1544 1545 void C2_MacroAssembler::arrays_equals(Register a1, Register a2, 1546 Register tmp1, Register tmp2, Register tmp3, 1547 Register result, int elem_size) { 1548 assert(elem_size == 1 || elem_size == 2, "must be char or byte"); 1549 assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0); 1550 1551 int elem_per_word = wordSize/elem_size; 1552 int log_elem_size = exact_log2(elem_size); 1553 int length_offset = arrayOopDesc::length_offset_in_bytes(); 1554 int base_offset = 
arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 1555 1556 Register cnt1 = tmp3; 1557 Register cnt2 = tmp1; // cnt2 only used in array length compare 1558 Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01; 1559 1560 BLOCK_COMMENT("arrays_equals {"); 1561 1562 // if (a1 == a2), return true 1563 beq(a1, a2, SAME); 1564 1565 mv(result, false); 1566 // if (a1 == nullptr || a2 == nullptr) 1567 // return false; 1568 beqz(a1, DONE); 1569 beqz(a2, DONE); 1570 1571 // if (a1.length != a2.length) 1572 // return false; 1573 lwu(cnt1, Address(a1, length_offset)); 1574 lwu(cnt2, Address(a2, length_offset)); 1575 bne(cnt1, cnt2, DONE); 1576 1577 la(a1, Address(a1, base_offset)); 1578 la(a2, Address(a2, base_offset)); 1579 // Check for short strings, i.e. smaller than wordSize. 1580 addi(cnt1, cnt1, -elem_per_word); 1581 bltz(cnt1, SHORT); 1582 1583 // Main 8 byte comparison loop. 1584 bind(NEXT_WORD); { 1585 ld(tmp1, Address(a1)); 1586 ld(tmp2, Address(a2)); 1587 addi(cnt1, cnt1, -elem_per_word); 1588 addi(a1, a1, wordSize); 1589 addi(a2, a2, wordSize); 1590 bne(tmp1, tmp2, DONE); 1591 } bgez(cnt1, NEXT_WORD); 1592 1593 addi(tmp1, cnt1, elem_per_word); 1594 beqz(tmp1, SAME); 1595 1596 bind(SHORT); 1597 test_bit(tmp1, cnt1, 2 - log_elem_size); 1598 beqz(tmp1, TAIL03); // 0-7 bytes left. 1599 { 1600 lwu(tmp1, Address(a1)); 1601 lwu(tmp2, Address(a2)); 1602 addi(a1, a1, 4); 1603 addi(a2, a2, 4); 1604 bne(tmp1, tmp2, DONE); 1605 } 1606 1607 bind(TAIL03); 1608 test_bit(tmp1, cnt1, 1 - log_elem_size); 1609 beqz(tmp1, TAIL01); // 0-3 bytes left. 1610 { 1611 lhu(tmp1, Address(a1)); 1612 lhu(tmp2, Address(a2)); 1613 addi(a1, a1, 2); 1614 addi(a2, a2, 2); 1615 bne(tmp1, tmp2, DONE); 1616 } 1617 1618 bind(TAIL01); 1619 if (elem_size == 1) { // Only needed when comparing byte arrays. 1620 test_bit(tmp1, cnt1, 0); 1621 beqz(tmp1, SAME); // 0-1 bytes left. 
1622 { 1623 lbu(tmp1, Address(a1)); 1624 lbu(tmp2, Address(a2)); 1625 bne(tmp1, tmp2, DONE); 1626 } 1627 } 1628 1629 bind(SAME); 1630 mv(result, true); 1631 // That's it. 1632 bind(DONE); 1633 1634 BLOCK_COMMENT("} arrays_equals"); 1635 } 1636 1637 // Compare Strings 1638 1639 // For Strings we're passed the address of the first characters in a1 and a2 1640 // and the length in cnt1. There are two implementations. 1641 // For arrays >= 8 bytes, all comparisons (except for the tail) are performed 1642 // 8 bytes at a time. For the tail, we compare a halfword, then a short, and then a byte. 1643 // For strings < 8 bytes, we compare a halfword, then a short, and then a byte. 1644 1645 void C2_MacroAssembler::string_equals(Register a1, Register a2, 1646 Register result, Register cnt1) 1647 { 1648 Label SAME, DONE, SHORT, NEXT_WORD; 1649 Register tmp1 = t0; 1650 Register tmp2 = t1; 1651 1652 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2); 1653 1654 BLOCK_COMMENT("string_equals {"); 1655 1656 mv(result, false); 1657 1658 // Check for short strings, i.e. smaller than wordSize. 1659 addi(cnt1, cnt1, -wordSize); 1660 bltz(cnt1, SHORT); 1661 1662 // Main 8 byte comparison loop. 1663 bind(NEXT_WORD); { 1664 ld(tmp1, Address(a1)); 1665 ld(tmp2, Address(a2)); 1666 addi(cnt1, cnt1, -wordSize); 1667 addi(a1, a1, wordSize); 1668 addi(a2, a2, wordSize); 1669 bne(tmp1, tmp2, DONE); 1670 } bgez(cnt1, NEXT_WORD); 1671 1672 addi(tmp1, cnt1, wordSize); 1673 beqz(tmp1, SAME); 1674 1675 bind(SHORT); 1676 Label TAIL03, TAIL01; 1677 1678 // 0-7 bytes left. 1679 test_bit(tmp1, cnt1, 2); 1680 beqz(tmp1, TAIL03); 1681 { 1682 lwu(tmp1, Address(a1)); 1683 lwu(tmp2, Address(a2)); 1684 addi(a1, a1, 4); 1685 addi(a2, a2, 4); 1686 bne(tmp1, tmp2, DONE); 1687 } 1688 1689 bind(TAIL03); 1690 // 0-3 bytes left. 
1691 test_bit(tmp1, cnt1, 1); 1692 beqz(tmp1, TAIL01); 1693 { 1694 lhu(tmp1, Address(a1)); 1695 lhu(tmp2, Address(a2)); 1696 addi(a1, a1, 2); 1697 addi(a2, a2, 2); 1698 bne(tmp1, tmp2, DONE); 1699 } 1700 1701 bind(TAIL01); 1702 // 0-1 bytes left. 1703 test_bit(tmp1, cnt1, 0); 1704 beqz(tmp1, SAME); 1705 { 1706 lbu(tmp1, Address(a1)); 1707 lbu(tmp2, Address(a2)); 1708 bne(tmp1, tmp2, DONE); 1709 } 1710 1711 // Arrays are equal. 1712 bind(SAME); 1713 mv(result, true); 1714 1715 // That's it. 1716 bind(DONE); 1717 BLOCK_COMMENT("} string_equals"); 1718 } 1719 1720 // jdk.internal.util.ArraysSupport.vectorizedHashCode 1721 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 1722 Register tmp1, Register tmp2, Register tmp3, 1723 Register tmp4, Register tmp5, Register tmp6, 1724 BasicType eltype) 1725 { 1726 assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1); 1727 1728 const int elsize = arrays_hashcode_elsize(eltype); 1729 const int chunks_end_shift = exact_log2(elsize); 1730 1731 switch (eltype) { 1732 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 1733 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 1734 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 1735 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 1736 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 1737 default: 1738 ShouldNotReachHere(); 1739 } 1740 1741 const int stride = 4; 1742 const Register pow31_4 = tmp1; 1743 const Register pow31_3 = tmp2; 1744 const Register pow31_2 = tmp3; 1745 const Register chunks = tmp4; 1746 const Register chunks_end = chunks; 1747 1748 Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP; 1749 1750 // result has a value initially 1751 1752 beqz(cnt, DONE); 1753 1754 andi(chunks, cnt, ~(stride-1)); 1755 beqz(chunks, TAIL); 1756 1757 mv(pow31_4, 923521); // [31^^4] 1758 mv(pow31_3, 29791); // [31^^3] 1759 mv(pow31_2, 961); // [31^^2] 1760 1761 
slli(chunks_end, chunks, chunks_end_shift); 1762 add(chunks_end, ary, chunks_end); 1763 andi(cnt, cnt, stride-1); // don't forget about tail! 1764 1765 bind(WIDE_LOOP); 1766 mulw(result, result, pow31_4); // 31^^4 * h 1767 arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype); 1768 arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype); 1769 arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype); 1770 arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype); 1771 mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0] 1772 addw(result, result, t0); 1773 mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1] 1774 addw(result, result, t1); 1775 slli(t0, tmp5, 5); // optimize 31^^1 * ary[i+2] 1776 subw(tmp5, t0, tmp5); // with ary[i+2]<<5 - ary[i+2] 1777 addw(result, result, tmp5); 1778 addw(result, result, tmp6); // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1] 1779 // + 31^^1 * ary[i+2] + 31^^0 * ary[i+3] 1780 addi(ary, ary, elsize * stride); 1781 bne(ary, chunks_end, WIDE_LOOP); 1782 beqz(cnt, DONE); 1783 1784 bind(TAIL); 1785 slli(chunks_end, cnt, chunks_end_shift); 1786 add(chunks_end, ary, chunks_end); 1787 1788 bind(TAIL_LOOP); 1789 arrays_hashcode_elload(t0, Address(ary), eltype); 1790 slli(t1, result, 5); // optimize 31 * result 1791 subw(result, t1, result); // with result<<5 - result 1792 addw(result, result, t0); 1793 addi(ary, ary, elsize); 1794 bne(ary, chunks_end, TAIL_LOOP); 1795 1796 bind(DONE); 1797 BLOCK_COMMENT("} // arrays_hashcode"); 1798 } 1799 1800 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 1801 switch (eltype) { 1802 case T_BOOLEAN: return sizeof(jboolean); 1803 case T_BYTE: return sizeof(jbyte); 1804 case T_SHORT: return sizeof(jshort); 1805 case T_CHAR: return sizeof(jchar); 1806 case T_INT: return sizeof(jint); 1807 default: 1808 ShouldNotReachHere(); 1809 return -1; 1810 } 1811 } 1812 1813 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 1814 switch (eltype) { 1815 // 
T_BOOLEAN used as surrogate for unsigned byte 1816 case T_BOOLEAN: lbu(dst, src); break; 1817 case T_BYTE: lb(dst, src); break; 1818 case T_SHORT: lh(dst, src); break; 1819 case T_CHAR: lhu(dst, src); break; 1820 case T_INT: lw(dst, src); break; 1821 default: 1822 ShouldNotReachHere(); 1823 } 1824 } 1825 1826 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far); 1827 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label, 1828 bool is_far, bool is_unordered); 1829 1830 static conditional_branch_insn conditional_branches[] = 1831 { 1832 /* SHORT branches */ 1833 (conditional_branch_insn)&MacroAssembler::beq, 1834 (conditional_branch_insn)&MacroAssembler::bgt, 1835 nullptr, // BoolTest::overflow 1836 (conditional_branch_insn)&MacroAssembler::blt, 1837 (conditional_branch_insn)&MacroAssembler::bne, 1838 (conditional_branch_insn)&MacroAssembler::ble, 1839 nullptr, // BoolTest::no_overflow 1840 (conditional_branch_insn)&MacroAssembler::bge, 1841 1842 /* UNSIGNED branches */ 1843 (conditional_branch_insn)&MacroAssembler::beq, 1844 (conditional_branch_insn)&MacroAssembler::bgtu, 1845 nullptr, 1846 (conditional_branch_insn)&MacroAssembler::bltu, 1847 (conditional_branch_insn)&MacroAssembler::bne, 1848 (conditional_branch_insn)&MacroAssembler::bleu, 1849 nullptr, 1850 (conditional_branch_insn)&MacroAssembler::bgeu 1851 }; 1852 1853 static float_conditional_branch_insn float_conditional_branches[] = 1854 { 1855 /* FLOAT SHORT branches */ 1856 (float_conditional_branch_insn)&MacroAssembler::float_beq, 1857 (float_conditional_branch_insn)&MacroAssembler::float_bgt, 1858 nullptr, // BoolTest::overflow 1859 (float_conditional_branch_insn)&MacroAssembler::float_blt, 1860 (float_conditional_branch_insn)&MacroAssembler::float_bne, 1861 (float_conditional_branch_insn)&MacroAssembler::float_ble, 1862 nullptr, // BoolTest::no_overflow 1863 
(float_conditional_branch_insn)&MacroAssembler::float_bge,

  /* DOUBLE SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::double_beq,
  (float_conditional_branch_insn)&MacroAssembler::double_bgt,
  nullptr,
  (float_conditional_branch_insn)&MacroAssembler::double_blt,
  (float_conditional_branch_insn)&MacroAssembler::double_bne,
  (float_conditional_branch_insn)&MacroAssembler::double_ble,
  nullptr,
  (float_conditional_branch_insn)&MacroAssembler::double_bge
};

// Emit the integer compare-and-branch selected by cmpFlag.
// cmpFlag is used directly as an index into the conditional_branches table of
// member-function pointers; the selected emitter is invoked with
// (op1, op2, label, is_far). The index is range-checked in debug builds only.
void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
  assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
         "invalid conditional branch index");
  (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
}

// This function should only be used by C2. Flip the unordered when unordered-greater, C2 would use
// unordered-lesser instead of unordered-greater. Finally, commute the result bits at function do_one_bytecode().
//
// cmpFlag indexes float_conditional_branches. With the double_branch_mask bits
// cleared it is compared against BoolTest::ge / BoolTest::gt: the trailing
// boolean passed to the selected emitter is false for those two tests and true
// for every other test.
void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
  assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
         "invalid float conditional branch index");
  int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
  (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
    (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
}

// Branch to L on a comparison of op1 against zero.
// eq and le both map to beqz, ne and gt both map to bnez; any other cmpFlag is
// a programming error.
// NOTE(review): the le/gt mapping is only correct for an unsigned comparison
// (as the 'U' in the name suggests) -- confirm against the matching rules.
void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
  switch (cmpFlag) {
    case BoolTest::eq:
    case BoolTest::le:
      beqz(op1, L, is_far);
      break;
    case BoolTest::ne:
    case BoolTest::gt:
      bnez(op1, L, is_far);
      break;
    default:
      ShouldNotReachHere();
  }
}

// Branch to L if op1 is zero (BoolTest::eq) or non-zero (BoolTest::ne).
// Any other cmpFlag is a programming error.
void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
  switch (cmpFlag) {
    case BoolTest::eq:
      beqz(op1, L, is_far);
      break;
    case BoolTest::ne:
      bnez(op1, L, is_far);
      break;
    default:
      ShouldNotReachHere();
  }
}

// Conditional move implemented as a branch over a register move:
// mv(dst, src) is skipped when the branch selected by cmpFlag with bit
// neg_cond_bits toggled is taken.
// NOTE(review): toggling that bit presumably selects the negated condition in
// the conditional_branches table -- confirm against the table layout.
void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
  Label L;
  cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
  mv(dst, src);
  bind(L);
}

// Set dst to NaN if any NaN input.
// Otherwise dst = min(src1, src2) or max(src1, src2), selected by is_min, in
// single or double precision as selected by is_double.
// dst, src1 and src2 must be different registers; t0 and t1 are clobbered.
void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
                                  bool is_double, bool is_min) {
  assert_different_registers(dst, src1, src2);

  Label Done, Compare;

  // classify both inputs and merge the class bits
  is_double ? fclass_d(t0, src1)
            : fclass_s(t0, src1);
  is_double ? fclass_d(t1, src2)
            : fclass_s(t1, src2);
  orr(t0, t0, t1);
  andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
  beqz(t0, Compare);
  // at least one input is NaN: the sum of the inputs is the result
  is_double ? fadd_d(dst, src1, src2)
            : fadd_s(dst, src1, src2);
  j(Done);

  bind(Compare);
  if (is_double) {
    is_min ? fmin_d(dst, src1, src2)
           : fmax_d(dst, src1, src2);
  } else {
    is_min ? fmin_s(dst, src1, src2)
           : fmax_s(dst, src1, src2);
  }

  bind(Done);
}

// According to Java SE specification, for floating-point round operations, if
// the input is NaN, +/-infinity, or +/-0, the same input is returned as the
// rounded result; this differs from behavior of RISC-V fcvt instructions (which
// round out-of-range values to the nearest max or min value), therefore special
// handling is needed by NaN, +/-Infinity, +/-0.
//
// round_mode must be one of RoundDoubleModeNode::rmode_ceil, rmode_floor or
// rmode_rint. dst and src must differ, and tmp1, tmp2, tmp3 must be pairwise
// different; all three temporaries are clobbered.
void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
                                          Register tmp1, Register tmp2, Register tmp3) {

  assert_different_registers(dst, src);
  assert_different_registers(tmp1, tmp2, tmp3);

  // Set rounding mode for conversions
  // Here we use similar modes to double->long and long->double conversions
  // Different mode for long->double conversion matter only if long value was not representable as double,
  // we got long value as a result of double->long conversion so, it is definitely representable
  RoundingMode rm;
  switch (round_mode) {
    case RoundDoubleModeNode::rmode_ceil:
      rm = RoundingMode::rup;
      break;
    case RoundDoubleModeNode::rmode_floor:
      rm = RoundingMode::rdn;
      break;
    case RoundDoubleModeNode::rmode_rint:
      rm = RoundingMode::rne;
      break;
    default:
      ShouldNotReachHere();
  }

  // tmp1 - is a register to store double converted to long int
  // tmp2 - is a register to create constant for comparison
  // tmp3 - is a register where we store modified result of double->long conversion
  Label done, bad_val;

  // Conversion from double to long
  fcvt_l_d(tmp1, src, rm);

  // Generate constant (tmp2)
  // tmp2 = 100...0000
  addi(tmp2, zr, 1);
  slli(tmp2, tmp2, 63);

  // Prepare converted long (tmp1)
  // as a result when conversion overflow we got:
  // tmp1 = 011...1111 or 100...0000
  // Convert it to: tmp3 = 100...0000
  // (adding 1 and clearing bit 0 maps both saturated values onto tmp2)
  addi(tmp3, tmp1, 1);
  andi(tmp3, tmp3, -2);
  beq(tmp3, tmp2, bad_val);

  // Conversion from long to double
  fcvt_d_l(dst, tmp1, rm);
  // Add sign of input value to result for +/- 0 cases
  fsgnj_d(dst, dst, src);
  j(done);

  // If got conversion overflow return src
  bind(bad_val);
  fmv_d(dst, src);

  bind(done);
}

// According to Java SE specification, for floating-point signum operations, if
// on input we have NaN or +/-0.0 value we should return it,
// otherwise return +/- 1.0 using sign of input.
// dst - holds the input on entry and the result on exit
// one - gives us a floating-point 1.0 (got from matching rule)
// bool is_double - specifies single or double precision operations will be used.
// t0 is clobbered.
void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
  Label done;

  is_double ? fclass_d(t0, dst)
            : fclass_s(t0, dst);

  // check if input is -0, +0, signaling NaN or quiet NaN
  andi(t0, t0, fclass_mask::zero | fclass_mask::nan);

  // zero or NaN: dst already holds the result
  bnez(t0, done);

  // use floating-point 1.0 with a sign of input
  is_double ? fsgnj_d(dst, one, dst)
            : fsgnj_s(dst, one, dst);

  bind(done);
}

// Out-of-line path of float16_to_float for NaN and +/- Inf inputs.
// stub data: <0> destination float register, <1> source register holding the
// 16-bit value, <2> integer temporary. Clobbers tmp and t0, then jumps back to
// the stub's continuation.
static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
#define __ masm.
  FloatRegister dst = stub.data<0>();
  Register src = stub.data<1>();
  Register tmp = stub.data<2>();
  __ bind(stub.entry());

  // following instructions mainly focus on NaN, as riscv does not handle
  // NaN well with fcvt, but the code also works for Inf at the same time.

  // construct a NaN in 32 bits from the NaN in 16 bits,
  // we need the payloads of non-canonical NaNs to be preserved.
  __ mv(tmp, 0x7f800000);
  // sign-bit was already set via sign-extension if necessary.
  __ slli(t0, src, 13);
  __ orr(tmp, t0, tmp);
  __ fmv_w_x(dst, tmp);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.float16ToFloat
// dst - resulting float
// src - 16-bit half-precision input held in an integer register
// tmp - integer temporary (clobbered); t0 is clobbered as well.
void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
  // NOTE(review): the literal 20 is presumably the maximum stub size in bytes -- confirm against C2CodeStub::make.
  auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);

  // On riscv, NaN needs a special process as fcvt does not work in that case.
  // On riscv, Inf does not need a special process as fcvt can handle it correctly.
  // but we consider to get the slow path to process NaN and Inf at the same time,
  // as both of them are rare cases, and if we try to get the slow path to handle
  // only NaN case it would sacrifice the performance for normal cases,
  // i.e. non-NaN and non-Inf cases.

  // check whether it's a NaN or +/- Inf.
  // (all bits of the 0x7c00 field set)
  mv(t0, 0x7c00);
  andr(tmp, src, t0);
  // jump to stub processing NaN and Inf cases.
  beq(t0, tmp, stub->entry());

  // non-NaN or non-Inf cases, just use built-in instructions.
  fmv_h_x(dst, src);
  fcvt_s_h(dst, dst);

  bind(stub->continuation());
}

// Out-of-line path of float_to_float16 for NaN inputs.
// stub data: <0> destination integer register, <1> source float register,
// <2> integer temporary. Clobbers tmp and t0, then jumps back to the stub's
// continuation.
static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  FloatRegister src = stub.data<1>();
  Register tmp = stub.data<2>();
  __ bind(stub.entry());

  __ fmv_x_w(dst, src);

  // preserve the payloads of non-canonical NaNs.
  __ srai(dst, dst, 13);
  // preserve the sign bit.
  // (tmp = sign-extended upper bits shifted above the low 10 bits, with the low 10 bits all set)
  __ srai(tmp, dst, 13);
  __ slli(tmp, tmp, 10);
  __ mv(t0, 0x3ff);
  __ orr(tmp, tmp, t0);

  // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
  __ andr(dst, dst, tmp);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.floatToFloat16
// dst  - resulting 16-bit value in an integer register
// src  - float input
// ftmp - float temporary used for the converted half-precision value
// xtmp - integer temporary handed to the slow path; t0 is clobbered as well.
void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
  // NOTE(review): the literal 130 is presumably the maximum stub size in bytes -- confirm against C2CodeStub::make.
  auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);

  // On riscv, NaN needs a special process as fcvt does not work in that case.

  // check whether it's a NaN.
  // replace fclass with feq as performance optimization.
  // (feq of a value with itself yields 0 only for NaN)
  feq_s(t0, src, src);
  // jump to stub processing NaN cases.
  beqz(t0, stub->entry());

  // non-NaN cases, just use built-in instructions.
  fcvt_h_s(ftmp, src);
  fmv_x_h(dst, ftmp);

  bind(stub->continuation());
}

// Out-of-line path of float16_to_float_v.
// stub data: <0> destination vector, <1> source vector, <2> vector length.
// Only the elements selected by the v0 mask (set up by float16_to_float_v) are
// rewritten. Clobbers t0, then jumps back to the stub's continuation.
static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
#define __ masm.
  VectorRegister dst = stub.data<0>();
  VectorRegister src = stub.data<1>();
  uint vector_length = stub.data<2>();
  __ bind(stub.entry());

  // following instructions mainly focus on NaN, as riscv does not handle
  // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
  //
  // construct NaN's in 32 bits from the NaN's in 16 bits,
  // we need the payloads of non-canonical NaNs to be preserved.

  // adjust vector type to 2 * SEW.
  __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
  // widen and sign-extend src data.
  __ vsext_vf2(dst, src, Assembler::v0_t);
  __ mv(t0, 0x7f800000);
  // sign-bit was already set via sign-extension if necessary.
  __ vsll_vi(dst, dst, 13, Assembler::v0_t);
  __ vor_vx(dst, dst, t0, Assembler::v0_t);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.float16ToFloat
// Vector variant: converts vector_length half-precision elements of src into
// floats in dst. dst and src must be different registers; v0 and t0 are
// clobbered.
void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
  // NOTE(review): the literal 24 is presumably the maximum stub size in bytes -- confirm against C2CodeStub::make.
  auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
              (dst, src, vector_length, 24, float16_to_float_v_slow_path);
  assert_different_registers(dst, src);

  // On riscv, NaN needs a special process as vfwcvt_f_f_v does not work in that case.
  // On riscv, Inf does not need a special process as vfwcvt_f_f_v can handle it correctly.
  // but we consider to get the slow path to process NaN and Inf at the same time,
  // as both of them are rare cases, and if we try to get the slow path to handle
  // only NaN case it would sacrifice the performance for normal cases,
  // i.e. non-NaN and non-Inf cases.

  vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);

  // check whether there is a NaN or +/- Inf.
  mv(t0, 0x7c00);
  vand_vx(v0, src, t0);
  // v0 will be used as mask in slow path.
  vmseq_vx(v0, v0, t0);
  // t0 = number of elements that need the slow path
  vcpop_m(t0, v0);

  // For non-NaN or non-Inf cases, just use built-in instructions.
  vfwcvt_f_f_v(dst, src);

  // jump to stub processing NaN and Inf cases if there is any of them in the vector-wide.
  bnez(t0, stub->entry());

  bind(stub->continuation());
}

// Out-of-line path of float_to_float16_v.
// stub data: <0> destination vector, <1> source vector, <2> vector temporary.
// Only the elements selected by the v0 mask (set up by float_to_float16_v) are
// rewritten. Clobbers tmp and t0, then jumps back to the stub's continuation.
static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
                                         C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
#define __ masm.
  VectorRegister dst = stub.data<0>();
  VectorRegister src = stub.data<1>();
  VectorRegister tmp = stub.data<2>();
  __ bind(stub.entry());

  // mul is already set to mf2 in float_to_float16_v.

  // preserve the payloads of non-canonical NaNs.
  __ vnsra_wi(dst, src, 13, Assembler::v0_t);

  // preserve the sign bit.
  __ vnsra_wi(tmp, src, 26, Assembler::v0_t);
  __ vsll_vi(tmp, tmp, 10, Assembler::v0_t);
  __ mv(t0, 0x3ff);
  __ vor_vx(tmp, tmp, t0, Assembler::v0_t);

  // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
  __ vand_vv(dst, dst, tmp, Assembler::v0_t);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.floatToFloat16
// Vector variant: converts vector_length float elements of src into
// half-precision values in dst. dst, src and vtmp must be different registers;
// tmp is passed to the second vsetvli_helper call, and v0 and t0 are clobbered.
void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src, VectorRegister vtmp,
                                           Register tmp, uint vector_length) {
  assert_different_registers(dst, src, vtmp);

  // NOTE(review): the literal 28 is presumably the maximum stub size in bytes -- confirm against C2CodeStub::make.
  auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
              (dst, src, vtmp, 28, float_to_float16_v_slow_path);

  // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.

  vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);

  // check whether there is a NaN.
  // replace v_fclass with vmfne_vv as performance optimization.
  // (an element compares not-equal to itself only if it is NaN)
  vmfne_vv(v0, src, src);
  // t0 = number of NaN elements; v0 stays live as the mask for the slow path
  vcpop_m(t0, v0);

  vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);

  // For non-NaN cases, just use built-in instructions.
  vfncvt_f_f_w(dst, src);

  // jump to stub processing NaN cases.
  bnez(t0, stub->entry());

  bind(stub->continuation());
}

// Vector signum: dst holds the input on entry and the result on exit.
// Elements that are +/-0 or NaN are left unchanged, all other elements become
// 1.0 (taken from 'one') with the sign of the input. v0 and t0 are clobbered.
void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
  vsetvli_helper(bt, vlen);

  // check if input is -0, +0, signaling NaN or quiet NaN
  vfclass_v(v0, dst);
  mv(t0, fclass_mask::zero | fclass_mask::nan);
  vand_vx(v0, v0, t0);
  // mask = elements that are neither zero nor NaN
  vmseq_vi(v0, v0, 0);

  // use floating-point 1.0 with a sign of input
  vfsgnj_vv(dst, one, dst, v0_t);
}

// Bit compress: gather the bits of src selected by 'mask' into dst.
// The bits are widened to one byte each, compressed with vcompress under the
// mask, and narrowed back to bits. is_long selects 64-bit (e64, LMUL=4) versus
// 32-bit (e32, LMUL=2) operation.
// Clobbers t0, v0 and the vector register groups starting at v4 and v8.
void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
  Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
  // intrinsic is enabled when MaxVectorSize >= 16
  Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
  long len = is_long ? 64 : 32;

  // load the src data(in bits) to be compressed.
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_s_x(v0, src);
  // reset the src data(in bytes) to zero.
  mv(t0, len);
  vsetvli(x0, t0, Assembler::e8, lmul);
  vmv_v_i(v4, 0);
  // convert the src data from bits to bytes.
  vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
  // reset the dst data(in bytes) to zero.
  vmv_v_i(v8, 0);
  // load the mask data(in bits).
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_s_x(v0, mask);
  // compress the src data(in bytes) to dst(in bytes).
  vsetvli(x0, t0, Assembler::e8, lmul);
  vcompress_vm(v8, v4, v0);
  // convert the dst data from bytes to bits.
  vmseq_vi(v0, v8, 1);
  // store result back.
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_x_s(dst, v0);
}

// 32-bit variant of compress_bits_v.
void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
  compress_bits_v(dst, src, mask, /* is_long */ false);
}

// 64-bit variant of compress_bits_v.
void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
  compress_bits_v(dst, src, mask, /* is_long */ true);
}

// Bit expand: scatter the low-order bits of src to the bit positions selected
// by 'mask', writing the result to dst.
// The bits are widened to one byte each, placed with viota + masked vrgather,
// and narrowed back to bits. is_long selects 64-bit (e64, LMUL=4) versus
// 32-bit (e32, LMUL=2) operation.
// Clobbers t0, v0 and the vector register groups starting at v4, v8 and v12.
void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
  Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
  // intrinsic is enabled when MaxVectorSize >= 16
  Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
  long len = is_long ? 64 : 32;

  // load the src data(in bits) to be expanded.
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_s_x(v0, src);
  // reset the src data(in bytes) to zero.
  mv(t0, len);
  vsetvli(x0, t0, Assembler::e8, lmul);
  vmv_v_i(v4, 0);
  // convert the src data from bits to bytes.
  vmerge_vim(v4, v4, 1); // v0 as implicit mask register
  // reset the dst data(in bytes) to zero.
  vmv_v_i(v12, 0);
  // load the mask data(in bits).
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_s_x(v0, mask);
  // expand the src data(in bytes) to dst(in bytes).
  vsetvli(x0, t0, Assembler::e8, lmul);
  viota_m(v8, v0);
  vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
  // convert the dst data from bytes to bits.
  vmseq_vi(v0, v12, 1);
  // store result back.
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_x_s(dst, v0);
}

// 32-bit variant of expand_bits_v.
void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
  expand_bits_v(dst, src, mask, /* is_long */ false);
}

// 64-bit variant of expand_bits_v.
void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
  expand_bits_v(dst, src, mask, /* is_long */ true);
}

// Shared vector compare loop over two element sequences.
// a1, a2  - addresses of the two sequences (advanced by the loop)
// cnt     - remaining element count (decremented by the loop)
// islatin - true for 1-byte elements (e8), false for 2-byte elements (e16)
// vr1/vr2 - vector registers receiving the loaded elements (LMUL=2)
// vrs     - vector register receiving the not-equal mask
// On the first mismatch the code branches to DONE with tmp2 holding the index
// of the mismatching element within the current strip; 'result' is not written
// on that path. If all elements match, 'result' is set to true and control
// falls through. tmp1 is clobbered.
void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
                                        VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
  Label loop;
  Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;

  bind(loop);
  // tmp1 = number of elements handled in this iteration
  vsetvli(tmp1, cnt, sew, Assembler::m2);
  vlex_v(vr1, a1, sew);
  vlex_v(vr2, a2, sew);
  vmsne_vv(vrs, vr1, vr2);
  // tmp2 = index of first differing element, or -1 if there is none
  vfirst_m(tmp2, vrs);
  bgez(tmp2, DONE);
  sub(cnt, cnt, tmp1);
  if (!islatin) {
    slli(tmp1, tmp1, 1); // get byte counts
  }
  add(a1, a1, tmp1);
  add(a2, a2, tmp1);
  bnez(cnt, loop);

  mv(result, true);
}

// Compare cnt 1-byte elements at a1 and a2; result is true if they are all
// equal and false otherwise.
// a1, a2, cnt, t0, t1 and the vector register groups starting at v2 and v4 are
// clobbered.
void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
  Label DONE;
  Register tmp1 = t0;
  Register tmp2 = t1;

  BLOCK_COMMENT("string_equals_v {");

  // preset the answer for the mismatch path, which jumps straight to DONE
  mv(result, false);

  element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE);

  bind(DONE);
  BLOCK_COMMENT("} string_equals_v");
}

// used by C2 ClearArray patterns.
// base: Address of a buffer to be zeroed
// cnt: Count in HeapWords
//
// base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2377 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) { 2378 Label loop; 2379 2380 // making zero words 2381 vsetvli(t0, cnt, Assembler::e64, Assembler::m4); 2382 vxor_vv(v4, v4, v4); 2383 2384 bind(loop); 2385 vsetvli(t0, cnt, Assembler::e64, Assembler::m4); 2386 vse64_v(v4, base); 2387 sub(cnt, cnt, t0); 2388 shadd(base, t0, base, t0, 3); 2389 bnez(cnt, loop); 2390 } 2391 2392 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result, 2393 Register cnt1, int elem_size) { 2394 Label DONE; 2395 Register tmp1 = t0; 2396 Register tmp2 = t1; 2397 Register cnt2 = tmp2; 2398 int length_offset = arrayOopDesc::length_offset_in_bytes(); 2399 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 2400 2401 BLOCK_COMMENT("arrays_equals_v {"); 2402 2403 // if (a1 == a2), return true 2404 mv(result, true); 2405 beq(a1, a2, DONE); 2406 2407 mv(result, false); 2408 // if a1 == null or a2 == null, return false 2409 beqz(a1, DONE); 2410 beqz(a2, DONE); 2411 // if (a1.length != a2.length), return false 2412 lwu(cnt1, Address(a1, length_offset)); 2413 lwu(cnt2, Address(a2, length_offset)); 2414 bne(cnt1, cnt2, DONE); 2415 2416 la(a1, Address(a1, base_offset)); 2417 la(a2, Address(a2, base_offset)); 2418 2419 element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE); 2420 2421 bind(DONE); 2422 2423 BLOCK_COMMENT("} arrays_equals_v"); 2424 } 2425 2426 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2, 2427 Register result, Register tmp1, Register tmp2, int encForm) { 2428 Label DIFFERENCE, DONE, L, loop; 2429 bool encLL = encForm == StrIntrinsicNode::LL; 2430 bool encLU = encForm == StrIntrinsicNode::LU; 2431 bool encUL = encForm == StrIntrinsicNode::UL; 2432 2433 bool str1_isL = encLL || encLU; 2434 bool str2_isL = encLL || encUL; 2435 2436 int minCharsInWord = encLL ? 
wordSize : wordSize / 2; 2437 2438 BLOCK_COMMENT("string_compare {"); 2439 2440 // for Latin strings, 1 byte for 1 character 2441 // for UTF16 strings, 2 bytes for 1 character 2442 if (!str1_isL) 2443 sraiw(cnt1, cnt1, 1); 2444 if (!str2_isL) 2445 sraiw(cnt2, cnt2, 1); 2446 2447 // if str1 == str2, return the difference 2448 // save the minimum of the string lengths in cnt2. 2449 sub(result, cnt1, cnt2); 2450 bgt(cnt1, cnt2, L); 2451 mv(cnt2, cnt1); 2452 bind(L); 2453 2454 if (str1_isL == str2_isL) { // LL or UU 2455 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE); 2456 j(DONE); 2457 } else { // LU or UL 2458 Register strL = encLU ? str1 : str2; 2459 Register strU = encLU ? str2 : str1; 2460 VectorRegister vstr1 = encLU ? v8 : v4; 2461 VectorRegister vstr2 = encLU ? v4 : v8; 2462 2463 bind(loop); 2464 vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2); 2465 vle8_v(vstr1, strL); 2466 vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4); 2467 vzext_vf2(vstr2, vstr1); 2468 vle16_v(vstr1, strU); 2469 vmsne_vv(v4, vstr2, vstr1); 2470 vfirst_m(tmp2, v4); 2471 bgez(tmp2, DIFFERENCE); 2472 sub(cnt2, cnt2, tmp1); 2473 add(strL, strL, tmp1); 2474 shadd(strU, tmp1, strU, tmp1, 1); 2475 bnez(cnt2, loop); 2476 j(DONE); 2477 } 2478 2479 bind(DIFFERENCE); 2480 slli(tmp1, tmp2, 1); 2481 add(str1, str1, str1_isL ? tmp2 : tmp1); 2482 add(str2, str2, str2_isL ? tmp2 : tmp1); 2483 str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0)); 2484 str2_isL ? 
lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0)); 2485 sub(result, tmp1, tmp2); 2486 2487 bind(DONE); 2488 } 2489 2490 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) { 2491 Label loop; 2492 assert_different_registers(src, dst, len, tmp, t0); 2493 2494 BLOCK_COMMENT("byte_array_inflate_v {"); 2495 bind(loop); 2496 vsetvli(tmp, len, Assembler::e8, Assembler::m2); 2497 vle8_v(v6, src); 2498 vsetvli(t0, len, Assembler::e16, Assembler::m4); 2499 vzext_vf2(v4, v6); 2500 vse16_v(v4, dst); 2501 sub(len, len, tmp); 2502 add(src, src, tmp); 2503 shadd(dst, tmp, dst, tmp, 1); 2504 bnez(len, loop); 2505 BLOCK_COMMENT("} byte_array_inflate_v"); 2506 } 2507 2508 // Compress char[] array to byte[]. 2509 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) 2510 // result: the array length if every element in array can be encoded, 2511 // otherwise, the index of first non-latin1 (> 0xff) character. 2512 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len, 2513 Register result, Register tmp) { 2514 encode_iso_array_v(src, dst, len, result, tmp, false); 2515 } 2516 2517 // Intrinsic for 2518 // 2519 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray 2520 // return the number of characters copied. 2521 // - java/lang/StringUTF16.compress 2522 // return index of non-latin1 character if copy fails, otherwise 'len'. 2523 // 2524 // This version always returns the number of characters copied. A successful 2525 // copy will complete with the post-condition: 'res' == 'len', while an 2526 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'. 
2527 // 2528 // Clobbers: src, dst, len, result, t0 2529 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len, 2530 Register result, Register tmp, bool ascii) { 2531 Label loop, fail, done; 2532 2533 BLOCK_COMMENT("encode_iso_array_v {"); 2534 mv(result, 0); 2535 2536 bind(loop); 2537 mv(tmp, ascii ? 0x7f : 0xff); 2538 vsetvli(t0, len, Assembler::e16, Assembler::m2); 2539 vle16_v(v2, src); 2540 2541 vmsgtu_vx(v1, v2, tmp); 2542 vfirst_m(tmp, v1); 2543 vmsbf_m(v0, v1); 2544 // compress char to byte 2545 vsetvli(t0, len, Assembler::e8); 2546 vncvt_x_x_w(v1, v2, Assembler::v0_t); 2547 vse8_v(v1, dst, Assembler::v0_t); 2548 2549 // fail if char > 0x7f/0xff 2550 bgez(tmp, fail); 2551 add(result, result, t0); 2552 add(dst, dst, t0); 2553 sub(len, len, t0); 2554 shadd(src, t0, src, t0, 1); 2555 bnez(len, loop); 2556 j(done); 2557 2558 bind(fail); 2559 add(result, result, tmp); 2560 2561 bind(done); 2562 BLOCK_COMMENT("} encode_iso_array_v"); 2563 } 2564 2565 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) { 2566 Label LOOP, SET_RESULT, DONE; 2567 2568 BLOCK_COMMENT("count_positives_v {"); 2569 assert_different_registers(ary, len, result, tmp); 2570 2571 mv(result, zr); 2572 2573 bind(LOOP); 2574 vsetvli(t0, len, Assembler::e8, Assembler::m4); 2575 vle8_v(v4, ary); 2576 vmslt_vx(v4, v4, zr); 2577 vfirst_m(tmp, v4); 2578 bgez(tmp, SET_RESULT); 2579 // if tmp == -1, all bytes are positive 2580 add(result, result, t0); 2581 2582 sub(len, len, t0); 2583 add(ary, ary, t0); 2584 bnez(len, LOOP); 2585 j(DONE); 2586 2587 // add remaining positive bytes count 2588 bind(SET_RESULT); 2589 add(result, result, tmp); 2590 2591 bind(DONE); 2592 BLOCK_COMMENT("} count_positives_v"); 2593 } 2594 2595 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1, 2596 Register ch, Register result, 2597 Register tmp1, Register tmp2, 2598 bool isL) { 2599 mv(result, zr); 2600 2601 Label loop, 
MATCH, DONE; 2602 Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16; 2603 bind(loop); 2604 vsetvli(tmp1, cnt1, sew, Assembler::m4); 2605 vlex_v(v4, str1, sew); 2606 vmseq_vx(v4, v4, ch); 2607 vfirst_m(tmp2, v4); 2608 bgez(tmp2, MATCH); // if equal, return index 2609 2610 add(result, result, tmp1); 2611 sub(cnt1, cnt1, tmp1); 2612 if (!isL) slli(tmp1, tmp1, 1); 2613 add(str1, str1, tmp1); 2614 bnez(cnt1, loop); 2615 2616 mv(result, -1); 2617 j(DONE); 2618 2619 bind(MATCH); 2620 add(result, result, tmp2); 2621 2622 bind(DONE); 2623 } 2624 2625 // Set dst to NaN if any NaN input. 2626 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, 2627 BasicType bt, bool is_min, uint vector_length) { 2628 assert_different_registers(dst, src1, src2); 2629 2630 vsetvli_helper(bt, vector_length); 2631 2632 is_min ? vfmin_vv(dst, src1, src2) 2633 : vfmax_vv(dst, src1, src2); 2634 2635 vmfne_vv(v0, src1, src1); 2636 vfadd_vv(dst, src1, src1, Assembler::v0_t); 2637 vmfne_vv(v0, src2, src2); 2638 vfadd_vv(dst, src2, src2, Assembler::v0_t); 2639 } 2640 2641 // Set dst to NaN if any NaN input. 2642 // The destination vector register elements corresponding to masked-off elements 2643 // are handled with a mask-undisturbed policy. 2644 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, 2645 VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2, 2646 BasicType bt, bool is_min, uint vector_length) { 2647 assert_different_registers(src1, src2, tmp1, tmp2); 2648 vsetvli_helper(bt, vector_length); 2649 2650 // Check vector elements of src1 and src2 for NaN. 2651 vmfeq_vv(tmp1, src1, src1); 2652 vmfeq_vv(tmp2, src2, src2); 2653 2654 vmandn_mm(v0, vmask, tmp1); 2655 vfadd_vv(dst, src1, src1, Assembler::v0_t); 2656 vmandn_mm(v0, vmask, tmp2); 2657 vfadd_vv(dst, src2, src2, Assembler::v0_t); 2658 2659 vmand_mm(tmp2, tmp1, tmp2); 2660 vmand_mm(v0, vmask, tmp2); 2661 is_min ? 
vfmin_vv(dst, src1, src2, Assembler::v0_t) 2662 : vfmax_vv(dst, src1, src2, Assembler::v0_t); 2663 } 2664 2665 // Set dst to NaN if any NaN input. 2666 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst, 2667 FloatRegister src1, VectorRegister src2, 2668 VectorRegister tmp1, VectorRegister tmp2, 2669 bool is_double, bool is_min, uint vector_length, VectorMask vm) { 2670 assert_different_registers(dst, src1); 2671 assert_different_registers(src2, tmp1, tmp2); 2672 2673 Label L_done, L_NaN_1, L_NaN_2; 2674 // Set dst to src1 if src1 is NaN 2675 is_double ? feq_d(t0, src1, src1) 2676 : feq_s(t0, src1, src1); 2677 beqz(t0, L_NaN_2); 2678 2679 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length); 2680 vfmv_s_f(tmp2, src1); 2681 2682 is_min ? vfredmin_vs(tmp1, src2, tmp2, vm) 2683 : vfredmax_vs(tmp1, src2, tmp2, vm); 2684 vfmv_f_s(dst, tmp1); 2685 2686 // Checking NaNs in src2 2687 vmfne_vv(tmp1, src2, src2, vm); 2688 vcpop_m(t0, tmp1, vm); 2689 beqz(t0, L_done); 2690 2691 bind(L_NaN_1); 2692 vfredusum_vs(tmp1, src2, tmp2, vm); 2693 vfmv_f_s(dst, tmp1); 2694 j(L_done); 2695 2696 bind(L_NaN_2); 2697 is_double ? 
fmv_d(dst, src1) 2698 : fmv_s(dst, src1); 2699 bind(L_done); 2700 } 2701 2702 bool C2_MacroAssembler::in_scratch_emit_size() { 2703 if (ciEnv::current()->task() != nullptr) { 2704 PhaseOutput* phase_output = Compile::current()->output(); 2705 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2706 return true; 2707 } 2708 } 2709 return MacroAssembler::in_scratch_emit_size(); 2710 } 2711 2712 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1, 2713 VectorRegister src2, VectorRegister tmp, 2714 int opc, BasicType bt, uint vector_length, VectorMask vm) { 2715 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2716 vsetvli_helper(bt, vector_length); 2717 vmv_s_x(tmp, src1); 2718 switch (opc) { 2719 case Op_AddReductionVI: 2720 case Op_AddReductionVL: 2721 vredsum_vs(tmp, src2, tmp, vm); 2722 break; 2723 case Op_AndReductionV: 2724 vredand_vs(tmp, src2, tmp, vm); 2725 break; 2726 case Op_OrReductionV: 2727 vredor_vs(tmp, src2, tmp, vm); 2728 break; 2729 case Op_XorReductionV: 2730 vredxor_vs(tmp, src2, tmp, vm); 2731 break; 2732 case Op_MaxReductionV: 2733 vredmax_vs(tmp, src2, tmp, vm); 2734 break; 2735 case Op_MinReductionV: 2736 vredmin_vs(tmp, src2, tmp, vm); 2737 break; 2738 default: 2739 ShouldNotReachHere(); 2740 } 2741 vmv_x_s(dst, tmp); 2742 } 2743 2744 // Set vl and vtype for full and partial vector operations. 
2745 // (vma = mu, vta = tu, vill = false) 2746 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) { 2747 Assembler::SEW sew = Assembler::elemtype_to_sew(bt); 2748 if (vector_length <= 31) { 2749 vsetivli(tmp, vector_length, sew, vlmul); 2750 } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) { 2751 vsetvli(tmp, x0, sew, vlmul); 2752 } else { 2753 mv(tmp, vector_length); 2754 vsetvli(tmp, tmp, sew, vlmul); 2755 } 2756 } 2757 2758 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2, 2759 int cond, BasicType bt, uint vector_length, VectorMask vm) { 2760 assert(is_integral_type(bt), "unsupported element type"); 2761 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers"); 2762 vsetvli_helper(bt, vector_length); 2763 vmclr_m(vd); 2764 switch (cond) { 2765 case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break; 2766 case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break; 2767 case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break; 2768 case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break; 2769 case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break; 2770 case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break; 2771 default: 2772 assert(false, "unsupported compare condition"); 2773 ShouldNotReachHere(); 2774 } 2775 } 2776 2777 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2, 2778 int cond, BasicType bt, uint vector_length, VectorMask vm) { 2779 assert(is_floating_point_type(bt), "unsupported element type"); 2780 assert(vm == Assembler::v0_t ? 
vd != v0 : true, "should be different registers"); 2781 vsetvli_helper(bt, vector_length); 2782 vmclr_m(vd); 2783 switch (cond) { 2784 case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break; 2785 case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break; 2786 case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break; 2787 case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break; 2788 case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break; 2789 case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break; 2790 default: 2791 assert(false, "unsupported compare condition"); 2792 ShouldNotReachHere(); 2793 } 2794 } 2795 2796 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length, 2797 VectorRegister src, BasicType src_bt, bool is_signed) { 2798 assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size"); 2799 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type"); 2800 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands 2801 // The destination EEW is greater than the source EEW, the source EMUL is at least 1, 2802 // and the overlap is in the highest-numbered part of the destination register group. 2803 // Since LMUL=1, vd and vs cannot be the same. 
2804 assert_different_registers(dst, src); 2805 2806 vsetvli_helper(dst_bt, vector_length); 2807 if (is_signed) { 2808 if (src_bt == T_BYTE) { 2809 switch (dst_bt) { 2810 case T_SHORT: 2811 vsext_vf2(dst, src); 2812 break; 2813 case T_INT: 2814 vsext_vf4(dst, src); 2815 break; 2816 case T_LONG: 2817 vsext_vf8(dst, src); 2818 break; 2819 default: 2820 ShouldNotReachHere(); 2821 } 2822 } else if (src_bt == T_SHORT) { 2823 if (dst_bt == T_INT) { 2824 vsext_vf2(dst, src); 2825 } else { 2826 vsext_vf4(dst, src); 2827 } 2828 } else if (src_bt == T_INT) { 2829 vsext_vf2(dst, src); 2830 } 2831 } else { 2832 if (src_bt == T_BYTE) { 2833 switch (dst_bt) { 2834 case T_SHORT: 2835 vzext_vf2(dst, src); 2836 break; 2837 case T_INT: 2838 vzext_vf4(dst, src); 2839 break; 2840 case T_LONG: 2841 vzext_vf8(dst, src); 2842 break; 2843 default: 2844 ShouldNotReachHere(); 2845 } 2846 } else if (src_bt == T_SHORT) { 2847 if (dst_bt == T_INT) { 2848 vzext_vf2(dst, src); 2849 } else { 2850 vzext_vf4(dst, src); 2851 } 2852 } else if (src_bt == T_INT) { 2853 vzext_vf2(dst, src); 2854 } 2855 } 2856 } 2857 2858 // Vector narrow from src to dst with specified element sizes. 2859 // High part of dst vector will be filled with zero. 2860 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length, 2861 VectorRegister src, BasicType src_bt) { 2862 assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size"); 2863 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type"); 2864 mv(t0, vector_length); 2865 if (src_bt == T_LONG) { 2866 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions 2867 // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source. 
2868 // So we can currently only scale down by 1/2 the width at a time. 2869 vsetvli(t0, t0, Assembler::e32, Assembler::mf2); 2870 vncvt_x_x_w(dst, src); 2871 if (dst_bt == T_SHORT || dst_bt == T_BYTE) { 2872 vsetvli(t0, t0, Assembler::e16, Assembler::mf2); 2873 vncvt_x_x_w(dst, dst); 2874 if (dst_bt == T_BYTE) { 2875 vsetvli(t0, t0, Assembler::e8, Assembler::mf2); 2876 vncvt_x_x_w(dst, dst); 2877 } 2878 } 2879 } else if (src_bt == T_INT) { 2880 // T_SHORT 2881 vsetvli(t0, t0, Assembler::e16, Assembler::mf2); 2882 vncvt_x_x_w(dst, src); 2883 if (dst_bt == T_BYTE) { 2884 vsetvli(t0, t0, Assembler::e8, Assembler::mf2); 2885 vncvt_x_x_w(dst, dst); 2886 } 2887 } else if (src_bt == T_SHORT) { 2888 vsetvli(t0, t0, Assembler::e8, Assembler::mf2); 2889 vncvt_x_x_w(dst, src); 2890 } 2891 } 2892 2893 #define VFCVT_SAFE(VFLOATCVT) \ 2894 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \ 2895 assert_different_registers(dst, src); \ 2896 vxor_vv(dst, dst, dst); \ 2897 vmfeq_vv(v0, src, src); \ 2898 VFLOATCVT(dst, src, Assembler::v0_t); \ 2899 } 2900 2901 VFCVT_SAFE(vfcvt_rtz_x_f_v); 2902 2903 #undef VFCVT_SAFE 2904 2905 // Extract a scalar element from an vector at position 'idx'. 2906 // The input elements in src are expected to be of integral type. 2907 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt, 2908 int idx, VectorRegister tmp) { 2909 assert(is_integral_type(bt), "unsupported element type"); 2910 assert(idx >= 0, "idx cannot be negative"); 2911 // Only need the first element after vector slidedown 2912 vsetvli_helper(bt, 1); 2913 if (idx == 0) { 2914 vmv_x_s(dst, src); 2915 } else if (idx <= 31) { 2916 vslidedown_vi(tmp, src, idx); 2917 vmv_x_s(dst, tmp); 2918 } else { 2919 mv(t0, idx); 2920 vslidedown_vx(tmp, src, t0); 2921 vmv_x_s(dst, tmp); 2922 } 2923 } 2924 2925 // Extract a scalar element from an vector at position 'idx'. 
2926 // The input elements in src are expected to be of floating point type. 2927 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt, 2928 int idx, VectorRegister tmp) { 2929 assert(is_floating_point_type(bt), "unsupported element type"); 2930 assert(idx >= 0, "idx cannot be negative"); 2931 // Only need the first element after vector slidedown 2932 vsetvli_helper(bt, 1); 2933 if (idx == 0) { 2934 vfmv_f_s(dst, src); 2935 } else if (idx <= 31) { 2936 vslidedown_vi(tmp, src, idx); 2937 vfmv_f_s(dst, tmp); 2938 } else { 2939 mv(t0, idx); 2940 vslidedown_vx(tmp, src, t0); 2941 vfmv_f_s(dst, tmp); 2942 } 2943 }