/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
                                  Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
  // Use cr register to indicate the fast_lock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label object_has_monitor;
  // Finish fast lock successfully. MUST branch to with flag == 0
  Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);

  mv(flag, 1);

  // Load markWord from object into displaced_header.
  ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    lwu(tmp, Address(tmp, Klass::access_flags_offset()));
    test_bit(tmp, tmp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
    bnez(tmp, slow_path);
  }

  // Check for existing monitor
  test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value));
  bnez(tmp, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    j(slow_path);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    ori(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
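    // The displaced header saved in the box is what fast_unlock later reads back:
    // a non-zero value is the mark to restore, zero marks a recursive stack-lock.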
    sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64,
            Assembler::aq, Assembler::rl, /*result*/disp_hdr);
    beq(disp_hdr, tmp, locked);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at the locked label.
    // Otherwise we did not see an unlocked object, so try the fast recursive case.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    sub(disp_hdr, disp_hdr, sp);
    mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
    // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto label locked,
    // hence we can store 0 as the displaced header in the box, which indicates that it is a
    // recursive lock.
    andr(tmp/*==0?*/, disp_hdr, tmp);
    sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    beqz(tmp, locked);
    j(slow_path);
  }

  // Handle existing monitor.
  bind(object_has_monitor);
  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
  cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64,
          Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected)

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
  mv(tmp, (address)markWord::unused_mark().value());
  sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  beqz(tmp3Reg, locked); // CAS success means locking succeeded

  bne(tmp3Reg, xthread, slow_path); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg);

  bind(locked);
  mv(flag, zr);
  increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2Reg, tmp3Reg);

#ifdef ASSERT
  // Check that locked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Lock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Lock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
                                    Register tmp1Reg, Register tmp2Reg) {
  // Use cr register to indicate the fast_unlock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label object_has_monitor;
  // Finish fast unlock successfully. MUST branch to with flag == 0
  Label unlocked;
  // Finish fast unlock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);

  mv(flag, 1);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    beqz(disp_hdr, unlocked);
  }

  // Handle existing monitor.
  ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  test_bit(t0, tmp, exact_log2(markWord::monitor_value));
  bnez(t0, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    j(slow_path);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64,
            Assembler::relaxed, Assembler::rl, /*result*/tmp);
    beq(box, tmp, unlocked); // box == tmp if cas succeeds
    j(slow_path);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.

  // Recursive lock
  addi(disp_hdr, disp_hdr, -1);
  sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  j(unlocked);

  bind(notRecursive);
  ld(t0, Address(tmp, ObjectMonitor::EntryList_offset()));
  ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(t0, t0, disp_hdr); // Will be 0 if both are 0.
  bnez(t0, slow_path);

  // need a release store here
  la(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
  sd(zr, Address(tmp)); // set unowned

  bind(unlocked);
  mv(flag, zr);
  decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp1Reg, tmp2Reg);

#ifdef ASSERT
  // Check that unlocked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register tmp1, Register tmp2, Register tmp3) {
  // Flag register, zero for success; non-zero for failure.
  Register flag = t1;

  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);

  mv(flag, 1);

  // Handle inflated monitor.
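  // Inflated means the mark word carries an ObjectMonitor* tagged with
  // markWord::monitor_value (0b10) instead of an unlocked or stack-locked mark.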
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == 0
  Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp1, obj);
    lwu(tmp1, Address(tmp1, Klass::access_flags_offset()));
    test_bit(tmp1, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
    bnez(tmp1, slow_path);
  }

  const Register tmp1_mark = tmp1;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
    Label push;

    const Register tmp2_top = tmp2;
    const Register tmp3_t = tmp3;

    // Check if lock-stack is full.
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    mv(tmp3_t, (unsigned)LockStack::end_offset());
    bge(tmp2_top, tmp3_t, slow_path);

    // Check if recursive.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
    xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
    bne(tmp1_mark, tmp3_t, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    add(tmp3_t, xthread, tmp2_top);
    sd(obj, Address(tmp3_t));
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    if (!UseObjectMonitorTable) {
      // mark contains the tagged ObjectMonitor*.
      const Register tmp1_tagged_monitor = tmp1_mark;
      const uintptr_t monitor_tag = markWord::monitor_value;
      const Register tmp2_owner_addr = tmp2;
      const Register tmp3_owner = tmp3;

      // Compute owner address.
      la(tmp2_owner_addr, Address(tmp1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));

      // CAS owner (null => current thread).
      cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ xthread, Assembler::int64,
              /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
      beqz(tmp3_owner, locked);

      // Check if recursive.
      bne(tmp3_owner, xthread, slow_path);

      // Recursive.
      increment(Address(tmp1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1, tmp2, tmp3);
    } else {
      // OMCache lookup not supported yet. Take the slowpath.
      j(slow_path);
    }
  }

  bind(locked);
  mv(flag, zr);
  increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);

#ifdef ASSERT
  // Check that locked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Lock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Lock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register tmp1, Register tmp2,
                                                Register tmp3) {
  // Flag register, zero for success; non-zero for failure.
  Register flag = t1;

  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);

  mv(flag, 1);

  // Handle inflated monitor.
  Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. unlocked MUST branch to with flag == 0
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag != 0
  Label slow_path;

  const Register tmp1_mark = tmp1;
  const Register tmp2_top = tmp2;
  const Register tmp3_t = tmp3;

  { // Lightweight unlock

    // Check if obj is top of lock-stack.
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    subw(tmp2_top, tmp2_top, oopSize);
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t));
    // Top of lock stack was not obj. Must be monitor.
    bne(obj, tmp3_t, inflated_load_monitor);

    // Pop lock-stack.
    DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
    DEBUG_ONLY(sd(zr, Address(tmp3_t));)
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, unlocked);

    // Not recursive.
    // Load Mark.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
    beq(tmp1_mark, tmp3_t, unlocked);

    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
    DEBUG_ONLY(sd(obj, Address(tmp3_t));)
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_monitor);
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(tmp2_top, tmp2_top, oopSize);
    mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
    blt(tmp2_top, tmp3_t, check_done);
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t));
    bne(obj, tmp3_t, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    if (!UseObjectMonitorTable) {
      // mark contains the tagged ObjectMonitor*.
      const Register tmp1_monitor = tmp1_mark;
      const uintptr_t monitor_tag = markWord::monitor_value;

      // Untag the monitor.
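      // The low tag bits (markWord::monitor_value) are set in the mark, so
      // subtracting the tag yields the real ObjectMonitor address.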
      sub(tmp1_monitor, tmp1_mark, monitor_tag);

      const Register tmp2_recursions = tmp2;
      Label not_recursive;

      // Check if recursive.
      ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
      beqz(tmp2_recursions, not_recursive);

      // Recursive unlock.
      addi(tmp2_recursions, tmp2_recursions, -1);
      sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
      j(unlocked);

      bind(not_recursive);

      Label release;
      const Register tmp2_owner_addr = tmp2;

      // Compute owner address.
      la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));

      // Check if the entry lists are empty.
      ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset()));
      ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset()));
      orr(t0, t0, tmp3_t);
      beqz(t0, release);

      // The owner may be anonymous and we removed the last obj entry in
      // the lock-stack. This loses the information about the owner.
      // Write the thread to the owner field so the runtime knows the owner.
      sd(xthread, Address(tmp2_owner_addr));
      j(slow_path);

      bind(release);
      // Set owner to null.
      membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
      sd(zr, Address(tmp2_owner_addr));
    } else {
      // OMCache lookup not supported yet. Take the slowpath.
      j(slow_path);
    }
  }

  bind(unlocked);
  mv(flag, zr);
  decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);

#ifdef ASSERT
  // Check that unlocked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Unlock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Unlock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}

// short string
// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
                                                  Register ch, Register result,
                                                  bool isL)
{
  Register ch1 = t0;
  Register index = t1;

  BLOCK_COMMENT("string_indexof_char_short {");

  Label LOOP, LOOP1, LOOP4, LOOP8;
  Label MATCH, MATCH1, MATCH2, MATCH3,
        MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;

  mv(result, -1);
  mv(index, zr);

  bind(LOOP);
  addi(t0, index, 8);
  ble(t0, cnt1, LOOP8);
  addi(t0, index, 4);
  ble(t0, cnt1, LOOP4);
  j(LOOP1);

  bind(LOOP8);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
  beq(ch, ch1, MATCH4);
  isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
  beq(ch, ch1, MATCH5);
  isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
  beq(ch, ch1, MATCH6);
  isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
  beq(ch, ch1, MATCH7);
  addi(index, index, 8);
  addi(str1, str1, isL ? 8 : 16);
  blt(index, cnt1, LOOP);
  j(NOMATCH);

  bind(LOOP4);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  addi(index, index, 4);
  addi(str1, str1, isL ? 4 : 8);
  bge(index, cnt1, NOMATCH);

  bind(LOOP1);
  isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
  beq(ch, ch1, MATCH);
  addi(index, index, 1);
  addi(str1, str1, isL ? 1 : 2);
  blt(index, cnt1, LOOP1);
  j(NOMATCH);

  bind(MATCH1);
  addi(index, index, 1);
  j(MATCH);

  bind(MATCH2);
  addi(index, index, 2);
  j(MATCH);

  bind(MATCH3);
  addi(index, index, 3);
  j(MATCH);

  bind(MATCH4);
  addi(index, index, 4);
  j(MATCH);

  bind(MATCH5);
  addi(index, index, 5);
  j(MATCH);

  bind(MATCH6);
  addi(index, index, 6);
  j(MATCH);

  bind(MATCH7);
  addi(index, index, 7);

  bind(MATCH);
  mv(result, index);
  bind(NOMATCH);
  BLOCK_COMMENT("} string_indexof_char_short");
}

// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2,
                                            Register tmp3, Register tmp4,
                                            bool isL)
{
  Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
  Register ch1 = t0;
  Register orig_cnt = t1;
  Register mask1 = tmp3;
  Register mask2 = tmp2;
  Register match_mask = tmp1;
  Register trailing_char = tmp4;
  Register unaligned_elems = tmp4;

  BLOCK_COMMENT("string_indexof_char {");
  beqz(cnt1, NOMATCH);

  addi(t0, cnt1, isL ? -32 : -16);
  bgtz(t0, DO_LONG);
  string_indexof_char_short(str1, cnt1, ch, result, isL);
  j(DONE);

  bind(DO_LONG);
  mv(orig_cnt, cnt1);
  if (AvoidUnalignedAccesses) {
    Label ALIGNED;
    andi(unaligned_elems, str1, 0x7);
    beqz(unaligned_elems, ALIGNED);
    sub(unaligned_elems, unaligned_elems, 8);
    neg(unaligned_elems, unaligned_elems);
    if (!isL) {
      srli(unaligned_elems, unaligned_elems, 1);
    }
    // do unaligned part per element
    string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
    bgez(result, DONE);
    mv(orig_cnt, cnt1);
    sub(cnt1, cnt1, unaligned_elems);
    bind(ALIGNED);
  }

  // duplicate ch
  if (isL) {
    slli(ch1, ch, 8);
    orr(ch, ch1, ch);
  }
  slli(ch1, ch, 16);
  orr(ch, ch1, ch);
  slli(ch1, ch, 32);
  orr(ch, ch1, ch);

  if (!isL) {
    slli(cnt1, cnt1, 1);
  }

  uint64_t mask0101 = UCONST64(0x0101010101010101);
  uint64_t mask0001 = UCONST64(0x0001000100010001);
  mv(mask1, isL ? mask0101 : mask0001);
  uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
  uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
  mv(mask2, isL ? mask7f7f : mask7fff);

  bind(CH1_LOOP);
  ld(ch1, Address(str1));
  addi(str1, str1, 8);
  addi(cnt1, cnt1, -8);
  compute_match_mask(ch1, ch, match_mask, mask1, mask2);
  bnez(match_mask, HIT);
  bgtz(cnt1, CH1_LOOP);
  j(NOMATCH);

  bind(HIT);
  ctzc_bit(trailing_char, match_mask, isL, ch1, result);
  srli(trailing_char, trailing_char, 3);
  addi(cnt1, cnt1, 8);
  ble(cnt1, trailing_char, NOMATCH);
  // match case
  if (!isL) {
    srli(cnt1, cnt1, 1);
    srli(trailing_char, trailing_char, 1);
  }

  sub(result, orig_cnt, cnt1);
  add(result, result, trailing_char);
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof_char");
}

typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);

// Search for needle in haystack and return index or -1
// x10: result
// x11: haystack
// x12: haystack_len
// x13: needle
// x14: needle_len
void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
                                       Register haystack_len, Register needle_len,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       Register result, int ae)
{
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;

  Register ch1 = t0;
  Register ch2 = t1;
  Register nlen_tmp = tmp1; // needle len tmp
  Register hlen_tmp = tmp2; // haystack len tmp
  Register result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                    (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_indexof {");

  // Note, inline_string_indexOf() generates checks:
  // if (pattern.count > src.count) return -1;
  // if (pattern.count == 0) return 0;

  // We have two strings, a source string in haystack, haystack_len and a pattern string
  // in needle, needle_len. Find the first occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  // needle_len >= 8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
  sub(result_tmp, haystack_len, needle_len);
  // needle_len < 8, use linear scan
  sub(t0, needle_len, 8);
  bltz(t0, LINEARSEARCH);
  // needle_len >= 256, use linear scan
  sub(t0, needle_len, 256);
  bgez(t0, LINEARSTUB);
  // needle_len >= haystack_len/4, use linear scan
  srli(t0, haystack_len, 2);
  bge(needle_len, t0, LINEARSTUB);

  // Boyer-Moore-Horspool introduction:
  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = pattern[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = src[i+j];
  //     if (pattern[m-1] == c)
  //       int k;
  //       for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  //       if (k < 0) return j;
  //       // c < 256 for Latin1 string, so, no need for branch
  //       #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
  //       // LL case: (c< 256) always true. Remove branch
  //       j += bc[pattern[j+m-1]];
  //       #endif
  //       #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
  //       // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //       if (c < ASIZE)
  //         j += bc[pattern[j+m-1]];
  //       else
  //         j += 1
  //       #endif
  //       #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
  //       // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //       if (c < ASIZE)
  //         j += bc[pattern[j+m-1]];
  //       else
  //         j += m
  //       #endif
  //     }
  //     return -1;
  //   }

  // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
  Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;

  Register haystack_end = haystack_len;
  Register skipch = tmp2;

  // pattern length is >=8, so, we can read at least 1 register for cases when
  // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
  // UL case. We'll re-read last character in inner pre-loop code to have
  // single outer pre-loop load
  const int firstStep = isLL ? 7 : 3;

  const int ASIZE = 256;
  const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)

  sub(sp, sp, ASIZE);

  // init BC offset table with default value: needle_len
  slli(t0, needle_len, 8);
  orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
  slli(tmp1, t0, 16);
  orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
  slli(tmp1, t0, 32);
  orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]

  mv(ch1, sp); // ch1 is t0
  mv(tmp6, ASIZE / STORE_BYTES); // loop iterations

  bind(BM_INIT_LOOP);
  // for (i = 0; i < ASIZE; ++i)
  //   bc[i] = m;
  for (int i = 0; i < 4; i++) {
    sd(tmp5, Address(ch1, i * wordSize));
  }
  add(ch1, ch1, 32);
  sub(tmp6, tmp6, 4);
  bgtz(tmp6, BM_INIT_LOOP);

  sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
  Register orig_haystack = tmp5;
  mv(orig_haystack, haystack);
  // result_tmp = tmp4
  shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
  sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
  mv(tmp3, needle);

  // for (i = 0; i < m - 1; ) {
  //   c = pattern[i];
  //   ++i;
  //   // c < 256 for Latin1 string, so, no need for branch
  //   #ifdef PATTERN_STRING_IS_LATIN1
  //   bc[c] = m - i;
  //   #else
  //   if (c < ASIZE) bc[c] = m - i;
  //   #endif
  // }
  bind(BCLOOP);
  (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
  add(tmp3, tmp3, needle_chr_size);
  if (!needle_isL) {
    // ae == StrIntrinsicNode::UU
    mv(tmp6, ASIZE);
    bgeu(ch1, tmp6, BCSKIP);
  }
  add(tmp4, sp, ch1);
  sb(ch2, Address(tmp4)); // store skip offset to BC offset table

  bind(BCSKIP);
  sub(ch2, ch2, 1); // for next pattern element, skip distance -1
  bgtz(ch2, BCLOOP);

  // tmp6: pattern end, address after needle
  shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
  if (needle_isL == haystack_isL) {
    // load last 8 bytes (8LL/4UU symbols)
    ld(tmp6, Address(tmp6, -wordSize));
  } else {
    // UL: from UTF-16(source) search Latin1(pattern)
    lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
    // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
    // We'll have to wait until load completed, but it's still faster than per-character loads+checks
    srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
    slli(ch2, tmp6, XLEN - 24);
    srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
    slli(ch1, tmp6, XLEN - 16);
    srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
    andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
    slli(ch2, ch2, 16);
    orr(ch2, ch2, ch1); // 0x00000b0c
    slli(result, tmp3, 48); // use result as temp register
    orr(tmp6, tmp6, result); // 0x0a00000d
    slli(result, ch2, 16);
    orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
  }

  // i = m - 1;
  // skipch = j + i;
  // if (skipch == pattern[m - 1]
  //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  // else
  //   move j with bad char offset table
  bind(BMLOOPSTR2);
  // compare pattern to source string backward
  shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
  (this->*haystack_load_1chr)(skipch, Address(result), noreg);
  sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
  if (needle_isL == haystack_isL) {
    // re-init tmp3. It's for free because it's executed in parallel with
    // load above. Alternative is to initialize it before loop, but it'll
    // affect performance on in-order systems with 2 or more ld/st pipelines
    srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
  }
  if (!isLL) { // UU/UL case
    slli(ch2, nlen_tmp, 1); // offsets in bytes
  }
  bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
  add(result, haystack, isLL ? nlen_tmp : ch2);
  // load 8 bytes from source string
  // if isLL is false then read granularity can be 2
  load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
  mv(ch1, tmp6);
  if (isLL) {
    j(BMLOOPSTR1_AFTER_LOAD);
  } else {
    sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
    j(BMLOOPSTR1_CMP);
  }

  bind(BMLOOPSTR1);
  shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
  (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
  shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
  (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);

  bind(BMLOOPSTR1_AFTER_LOAD);
  sub(nlen_tmp, nlen_tmp, 1);
  bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);

  bind(BMLOOPSTR1_CMP);
  beq(ch1, ch2, BMLOOPSTR1);

  bind(BMSKIP);
  if (!isLL) {
    // if we've met UTF symbol while searching Latin1 pattern, then we can
    // skip needle_len symbols
    if (needle_isL != haystack_isL) {
      mv(result_tmp, needle_len);
    } else {
      mv(result_tmp, 1);
    }
    mv(t0, ASIZE);
    bgeu(skipch, t0, BMADV);
  }
  add(result_tmp, sp, skipch);
  lbu(result_tmp, Address(result_tmp)); // load skip offset

  bind(BMADV);
  sub(nlen_tmp, needle_len, 1);
  // move haystack after bad char skip offset
  shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
  ble(haystack, haystack_end, BMLOOPSTR2);
  add(sp, sp, ASIZE);
  j(NOMATCH);

  bind(BMLOOPSTR1_LASTCMP);
  bne(ch1, ch2, BMSKIP);

  bind(BMMATCH);
  sub(result, haystack, orig_haystack);
  if (!haystack_isL) {
    srli(result, result, 1);
  }
  add(sp, sp, ASIZE);
  j(DONE);

  bind(LINEARSTUB);
  sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
  bltz(t0, LINEARSEARCH);
  mv(result, zr);
  RuntimeAddress stub = nullptr;
  if (isLL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
    assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
  } else if (needle_isL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
    assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
  } else {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
    assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
  }
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(LINEARSEARCH);
  string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof");
}

// string_indexof
// result: x10
// src: x11
// src_count: x12
// pattern: x13
// pattern_count: x14 or 1/2/3/4
void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
                                                  Register haystack_len, Register needle_len,
                                                  Register tmp1, Register tmp2,
                                                  Register tmp3, Register tmp4,
                                                  int needle_con_cnt, Register result, int ae)
{
  // Note:
  // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
  // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
  assert(needle_con_cnt <= 4, "Invalid needle constant count");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Register ch1 = t0;
  Register ch2 = t1;
  Register hlen_neg = haystack_len, nlen_neg = needle_len;
  Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;

  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                    (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
  load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;

  Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;

  Register first = tmp3;

  if (needle_con_cnt == -1) {
    Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

    sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
    bltz(t0, DOSHORT);

    (this->*needle_load_1chr)(first, Address(needle), noreg);
    slli(t0, needle_len, needle_chr_shift);
    add(needle, needle, t0);
    neg(nlen_neg, t0);
    slli(t0, result_tmp, haystack_chr_shift);
    add(haystack, haystack, t0);
    neg(hlen_neg, t0);

    bind(FIRST_LOOP);
    add(t0, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    add(nlen_tmp, nlen_neg, needle_chr_size);
    add(hlen_tmp, hlen_neg, haystack_chr_size);
    bgez(nlen_tmp, MATCH);

    bind(STR1_NEXT);
    add(ch1, needle, nlen_tmp);
    (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    add(nlen_tmp, nlen_tmp, needle_chr_size);
    add(hlen_tmp, hlen_tmp, haystack_chr_size);
    bltz(nlen_tmp, STR1_NEXT);
    j(MATCH);

    bind(DOSHORT);
    if (needle_isL == haystack_isL) {
      sub(t0, needle_len, 2);
      bltz(t0, DO1);
      bgtz(t0, DO3);
    }
  }

  if (needle_con_cnt == 4) {
    Label CH1_LOOP;
    (this->*load_4chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 4);
    slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload first value, then we will read by 1 character per loop, instead of four
      // just shifting previous ch2 right by size of character in bits
      add(tmp3, haystack, hlen_neg);
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
      if (isLL) {
        // need to erase 1 most significant byte in 32-bit value of ch2
        slli(ch2, ch2, 40);
        srli(ch2, ch2, 32);
      } else {
        slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
      }
    }

    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
      slli(tmp3, tmp3, isLL ? 24 : 48);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
    Label CH1_LOOP;
    BLOCK_COMMENT("string_indexof DO2 {");
    bind(DO2);
    (this->*load_2chr)(ch1, Address(needle), noreg);
    if (needle_con_cnt == 2) {
      sub(result_tmp, haystack_len, 2);
    }
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // preload first value, then we will read by 1 character per loop, instead of two
      // just shifting previous ch2 right by size of character in bits
      add(tmp3, haystack, hlen_neg);
      (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
      slli(ch2, ch2, isLL ? 8 : 16);
    }
    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
      slli(tmp3, tmp3, isLL ? 8 : 16);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_2chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
    BLOCK_COMMENT("} string_indexof DO2");
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
    Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
    BLOCK_COMMENT("string_indexof DO3 {");

    bind(DO3);
    (this->*load_2chr)(first, Address(needle), noreg);
    (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
    if (needle_con_cnt == 3) {
      sub(result_tmp, haystack_len, 3);
    }
    slli(hlen_tmp, result_tmp, haystack_chr_shift);
    add(haystack, haystack, hlen_tmp);
    neg(hlen_neg, hlen_tmp);

    bind(FIRST_LOOP);
    add(ch2, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
      (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
      slli(tmp2, tmp2, isLL ? 8 : 16);
      add(ch2, ch2, tmp2);
    } else {
      (this->*load_2chr)(ch2, Address(ch2), noreg);
    }
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    j(MATCH);
    BLOCK_COMMENT("} string_indexof DO3");
  }

  if (needle_con_cnt == -1 || needle_con_cnt == 1) {
    Label DO1_LOOP;

    BLOCK_COMMENT("string_indexof DO1 {");
    bind(DO1);
    (this->*needle_load_1chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 1);
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);

    bind(DO1_LOOP);
    add(tmp3, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, DO1_LOOP);
    BLOCK_COMMENT("} string_indexof DO1");
  }

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(MATCH);
  srai(t0, hlen_neg, haystack_chr_shift);
  add(result, result_tmp, t0);

  bind(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       int ae)
{
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
        DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
        SHORT_LOOP_START, TAIL_CHECK, L;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // for L strings, 1 byte for 1 character
  // for U strings, 2 bytes for 1 character
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize / 2;

  load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) {
    sraiw(cnt1, cnt1, 1);
  }
  if (!str2_isL) {
    sraiw(cnt2, cnt2, 1);
  }

  // Compute the minimum of the string lengths and save the difference in result.
  sub(result, cnt1, cnt2);
  bgt(cnt1, cnt2, L);
  mv(cnt2, cnt1);
  bind(L);

  // A very short string
  mv(t0, minCharsInWord);
  ble(cnt2, t0, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      // check if str1 and str2 are the same pointer
      beq(str1, str2, DONE);
      // load 8 bytes once to compare
      ld(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      sub(cnt2, cnt2, minCharsInWord);
      beqz(cnt2, TAIL_CHECK);
      // convert cnt2 from characters to bytes
      if (!str1_isL) {
        slli(cnt2, cnt2, 1);
      }
      add(str2, str2, cnt2);
      add(str1, str1, cnt2);
      sub(cnt2, zr, cnt2);
    } else if (isLU) { // LU case
      lwu(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      add(str1, str1, cnt2);
      sub(cnt1, zr, cnt2);
      slli(cnt2, cnt2, 1);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 4);
    } else { // UL case
      ld(tmp1, Address(str1));
      lwu(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      slli(t0, cnt2, 1);
      sub(cnt1, zr, t0);
      add(str1, str1, t0);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 8);
    }
    addi(cnt2, cnt2, isUL ? 4 : 8);
    bne(tmp1, tmp2, DIFFERENCE);
    bgez(cnt2, TAIL);

    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) { // LL or UU
      add(t0, str1, cnt2);
      ld(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt2, cnt2, 8);
    } else if (isLU) { // LU case
      add(t0, str1, cnt1);
      lwu(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt1, cnt1, 4);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      addi(cnt2, cnt2, 8);
    } else { // UL case
      add(t0, str2, cnt2);
      lwu(tmp2, Address(t0));
      add(t0, str1, cnt1);
      ld(tmp1, Address(t0));
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      addi(cnt1, cnt1, 8);
      addi(cnt2, cnt2, 4);
    }
    bne(tmp1, tmp2, DIFFERENCE);
    bltz(cnt2, NEXT_WORD);
    bind(TAIL);
    if (str1_isL == str2_isL) { // LL or UU
      load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
      load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
    } else if (isLU) { // LU case
      load_int_misaligned(tmp1, Address(str1), tmp3, false);
      load_long_misaligned(tmp2, Address(str2), tmp3, 2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
    } else { // UL case
      load_int_misaligned(tmp2, Address(str2), tmp3, false);
      load_long_misaligned(tmp1, Address(str1), tmp3, 2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
    }
    bind(TAIL_CHECK);
    beq(tmp1, tmp2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    xorr(tmp3, tmp1, tmp2);
    ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
    srl(tmp1, tmp1, result);
    srl(tmp2, tmp2, result);
    if (isLL) {
      andi(tmp1, tmp1, 0xFF);
      andi(tmp2, tmp2, 0xFF);
    } else {
      andi(tmp1, tmp1, 0xFFFF);
      andi(tmp2, tmp2, 0xFFFF);
    }
    sub(result, tmp1, tmp2);
    j(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch (ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
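  // At this point cnt2 holds the minimum of the two string lengths, in characters.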
1501 beqz(cnt2, DONE); 1502 // arrange code to do most branches while loading and loading next characters 1503 // while comparing previous 1504 (this->*str1_load_chr)(tmp1, Address(str1), t0); 1505 addi(str1, str1, str1_chr_size); 1506 addi(cnt2, cnt2, -1); 1507 beqz(cnt2, SHORT_LAST_INIT); 1508 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1509 addi(str2, str2, str2_chr_size); 1510 j(SHORT_LOOP_START); 1511 bind(SHORT_LOOP); 1512 addi(cnt2, cnt2, -1); 1513 beqz(cnt2, SHORT_LAST); 1514 bind(SHORT_LOOP_START); 1515 (this->*str1_load_chr)(tmp2, Address(str1), t0); 1516 addi(str1, str1, str1_chr_size); 1517 (this->*str2_load_chr)(t0, Address(str2), t0); 1518 addi(str2, str2, str2_chr_size); 1519 bne(tmp1, cnt1, SHORT_LOOP_TAIL); 1520 addi(cnt2, cnt2, -1); 1521 beqz(cnt2, SHORT_LAST2); 1522 (this->*str1_load_chr)(tmp1, Address(str1), t0); 1523 addi(str1, str1, str1_chr_size); 1524 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1525 addi(str2, str2, str2_chr_size); 1526 beq(tmp2, t0, SHORT_LOOP); 1527 sub(result, tmp2, t0); 1528 j(DONE); 1529 bind(SHORT_LOOP_TAIL); 1530 sub(result, tmp1, cnt1); 1531 j(DONE); 1532 bind(SHORT_LAST2); 1533 beq(tmp2, t0, DONE); 1534 sub(result, tmp2, t0); 1535 1536 j(DONE); 1537 bind(SHORT_LAST_INIT); 1538 (this->*str2_load_chr)(cnt1, Address(str2), t0); 1539 addi(str2, str2, str2_chr_size); 1540 bind(SHORT_LAST); 1541 beq(tmp1, cnt1, DONE); 1542 sub(result, tmp1, cnt1); 1543 1544 bind(DONE); 1545 1546 BLOCK_COMMENT("} string_compare"); 1547 } 1548 1549 void C2_MacroAssembler::arrays_equals(Register a1, Register a2, 1550 Register tmp1, Register tmp2, Register tmp3, 1551 Register result, int elem_size) { 1552 assert(elem_size == 1 || elem_size == 2, "must be char or byte"); 1553 assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0); 1554 1555 int elem_per_word = wordSize/elem_size; 1556 int log_elem_size = exact_log2(elem_size); 1557 int length_offset = arrayOopDesc::length_offset_in_bytes(); 1558 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 1559 1560 Register cnt1 = tmp3; 1561 Register cnt2 = tmp1; // cnt2 only used in array length compare 1562 Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01; 1563 1564 BLOCK_COMMENT("arrays_equals {"); 1565 1566 // if (a1 == a2), return true 1567 beq(a1, a2, SAME); 1568 1569 mv(result, false); 1570 // if (a1 == nullptr || a2 == nullptr) 1571 // return false; 1572 beqz(a1, DONE); 1573 beqz(a2, DONE); 1574 1575 // if (a1.length != a2.length) 1576 // return false; 1577 lwu(cnt1, Address(a1, length_offset)); 1578 lwu(cnt2, Address(a2, length_offset)); 1579 bne(cnt1, cnt2, DONE); 1580 1581 la(a1, Address(a1, base_offset)); 1582 la(a2, Address(a2, base_offset)); 1583 // Check for short strings, i.e. smaller than wordSize. 1584 addi(cnt1, cnt1, -elem_per_word); 1585 bltz(cnt1, SHORT); 1586 1587 // Main 8 byte comparison loop. 1588 bind(NEXT_WORD); { 1589 ld(tmp1, Address(a1)); 1590 ld(tmp2, Address(a2)); 1591 addi(cnt1, cnt1, -elem_per_word); 1592 addi(a1, a1, wordSize); 1593 addi(a2, a2, wordSize); 1594 bne(tmp1, tmp2, DONE); 1595 } bgez(cnt1, NEXT_WORD); 1596 1597 addi(tmp1, cnt1, elem_per_word); 1598 beqz(tmp1, SAME); 1599 1600 bind(SHORT); 1601 test_bit(tmp1, cnt1, 2 - log_elem_size); 1602 beqz(tmp1, TAIL03); // 0-7 bytes left. 
1603 { 1604 lwu(tmp1, Address(a1)); 1605 lwu(tmp2, Address(a2)); 1606 addi(a1, a1, 4); 1607 addi(a2, a2, 4); 1608 bne(tmp1, tmp2, DONE); 1609 } 1610 1611 bind(TAIL03); 1612 test_bit(tmp1, cnt1, 1 - log_elem_size); 1613 beqz(tmp1, TAIL01); // 0-3 bytes left. 1614 { 1615 lhu(tmp1, Address(a1)); 1616 lhu(tmp2, Address(a2)); 1617 addi(a1, a1, 2); 1618 addi(a2, a2, 2); 1619 bne(tmp1, tmp2, DONE); 1620 } 1621 1622 bind(TAIL01); 1623 if (elem_size == 1) { // Only needed when comparing byte arrays. 1624 test_bit(tmp1, cnt1, 0); 1625 beqz(tmp1, SAME); // 0-1 bytes left. 1626 { 1627 lbu(tmp1, Address(a1)); 1628 lbu(tmp2, Address(a2)); 1629 bne(tmp1, tmp2, DONE); 1630 } 1631 } 1632 1633 bind(SAME); 1634 mv(result, true); 1635 // That's it. 1636 bind(DONE); 1637 1638 BLOCK_COMMENT("} arrays_equals"); 1639 } 1640 1641 // Compare Strings 1642 1643 // For Strings we're passed the address of the first characters in a1 and a2 1644 // and the length in cnt1. There are two implementations. 1645 // For arrays >= 8 bytes, all comparisons (except for the tail) are performed 1646 // 8 bytes at a time. For the tail, we compare a halfword, then a short, and then a byte. 1647 // For strings < 8 bytes, we compare a halfword, then a short, and then a byte. 1648 1649 void C2_MacroAssembler::string_equals(Register a1, Register a2, 1650 Register result, Register cnt1) 1651 { 1652 Label SAME, DONE, SHORT, NEXT_WORD; 1653 Register tmp1 = t0; 1654 Register tmp2 = t1; 1655 1656 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2); 1657 1658 BLOCK_COMMENT("string_equals {"); 1659 1660 mv(result, false); 1661 1662 // Check for short strings, i.e. smaller than wordSize. 1663 addi(cnt1, cnt1, -wordSize); 1664 bltz(cnt1, SHORT); 1665 1666 // Main 8 byte comparison loop. 1667 bind(NEXT_WORD); { 1668 ld(tmp1, Address(a1)); 1669 ld(tmp2, Address(a2)); 1670 addi(cnt1, cnt1, -wordSize); 1671 addi(a1, a1, wordSize); 1672 addi(a2, a2, wordSize); 1673 bne(tmp1, tmp2, DONE); 1674 } bgez(cnt1, NEXT_WORD); 1675 1676 addi(tmp1, cnt1, wordSize); 1677 beqz(tmp1, SAME); 1678 1679 bind(SHORT); 1680 Label TAIL03, TAIL01; 1681 1682 // 0-7 bytes left. 1683 test_bit(tmp1, cnt1, 2); 1684 beqz(tmp1, TAIL03); 1685 { 1686 lwu(tmp1, Address(a1)); 1687 lwu(tmp2, Address(a2)); 1688 addi(a1, a1, 4); 1689 addi(a2, a2, 4); 1690 bne(tmp1, tmp2, DONE); 1691 } 1692 1693 bind(TAIL03); 1694 // 0-3 bytes left. 1695 test_bit(tmp1, cnt1, 1); 1696 beqz(tmp1, TAIL01); 1697 { 1698 lhu(tmp1, Address(a1)); 1699 lhu(tmp2, Address(a2)); 1700 addi(a1, a1, 2); 1701 addi(a2, a2, 2); 1702 bne(tmp1, tmp2, DONE); 1703 } 1704 1705 bind(TAIL01); 1706 // 0-1 bytes left. 1707 test_bit(tmp1, cnt1, 0); 1708 beqz(tmp1, SAME); 1709 { 1710 lbu(tmp1, Address(a1)); 1711 lbu(tmp2, Address(a2)); 1712 bne(tmp1, tmp2, DONE); 1713 } 1714 1715 // Arrays are equal. 1716 bind(SAME); 1717 mv(result, true); 1718 1719 // That's it. 
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}

// jdk.internal.util.ArraysSupport.vectorizedHashCode
void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                        Register tmp1, Register tmp2, Register tmp3,
                                        Register tmp4, Register tmp5, Register tmp6,
                                        BasicType eltype)
{
  assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);

  const int elsize = arrays_hashcode_elsize(eltype);
  const int chunks_end_shift = exact_log2(elsize);

  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:
    ShouldNotReachHere();
  }

  const int stride = 4;
  const Register pow31_4 = tmp1;
  const Register pow31_3 = tmp2;
  const Register pow31_2 = tmp3;
  const Register chunks  = tmp4;
  const Register chunks_end = chunks;

  Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;

  // result already holds the initial hash value on entry.

  beqz(cnt, DONE);

  andi(chunks, cnt, ~(stride-1));
  beqz(chunks, TAIL);

  mv(pow31_4, 923521);           // [31^^4]
  mv(pow31_3,  29791);           // [31^^3]
  mv(pow31_2,    961);           // [31^^2]

  slli(chunks_end, chunks, chunks_end_shift);
  add(chunks_end, ary, chunks_end);
  andi(cnt, cnt, stride-1);      // don't forget about tail!

  bind(WIDE_LOOP);
  mulw(result, result, pow31_4); // 31^^4 * h
  arrays_hashcode_elload(t0,   Address(ary, 0 * elsize), eltype);
  arrays_hashcode_elload(t1,   Address(ary, 1 * elsize), eltype);
  arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
  arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
  mulw(t0, t0, pow31_3);         // 31^^3 * ary[i+0]
  addw(result, result, t0);
  mulw(t1, t1, pow31_2);         // 31^^2 * ary[i+1]
  addw(result, result, t1);
  slli(t0, tmp5, 5);             // optimize 31^^1 * ary[i+2]
  subw(tmp5, t0, tmp5);          // with ary[i+2]<<5 - ary[i+2]
  addw(result, result, tmp5);
  addw(result, result, tmp6);    // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
                                 //           + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
  addi(ary, ary, elsize * stride);
  bne(ary, chunks_end, WIDE_LOOP);
  beqz(cnt, DONE);

  bind(TAIL);
  slli(chunks_end, cnt, chunks_end_shift);
  add(chunks_end, ary, chunks_end);

  bind(TAIL_LOOP);
  arrays_hashcode_elload(t0, Address(ary), eltype);
  slli(t1, result, 5);           // optimize 31 * result
  subw(result, t1, result);      // with result<<5 - result
  addw(result, result, t0);
  addi(ary, ary, elsize);
  bne(ary, chunks_end, TAIL_LOOP);

  bind(DONE);
  BLOCK_COMMENT("} // arrays_hashcode");
}
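// For illustration only: the generated code above computes the standard
// Arrays.hashCode polynomial, unrolled four elements at a time. A roughly
// equivalent scalar Java sketch (not part of this file) would be:
//
//   int h = result;                       // initial value passed in
//   int i = 0;
//   for (; i + 4 <= cnt; i += 4) {        // WIDE_LOOP
//     h = 31*31*31*31 * h + 31*31*31 * ary[i] + 31*31 * ary[i+1]
//         + 31 * ary[i+2] + ary[i+3];
//   }
//   for (; i < cnt; i++) {                // TAIL_LOOP
//     h = 31 * h + ary[i];
//   }
//   return h;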
int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
  switch (eltype) {
  case T_BOOLEAN: return sizeof(jboolean);
  case T_BYTE:    return sizeof(jbyte);
  case T_SHORT:   return sizeof(jshort);
  case T_CHAR:    return sizeof(jchar);
  case T_INT:     return sizeof(jint);
  default:
    ShouldNotReachHere();
    return -1;
  }
}

void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
  switch (eltype) {
  // T_BOOLEAN used as surrogate for unsigned byte
  case T_BOOLEAN: lbu(dst, src);   break;
  case T_BYTE:    lb(dst, src);    break;
  case T_SHORT:   lh(dst, src);    break;
  case T_CHAR:    lhu(dst, src);   break;
  case T_INT:     lw(dst, src);    break;
  default:
    ShouldNotReachHere();
  }
}

typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
                                                              bool is_far, bool is_unordered);

static conditional_branch_insn conditional_branches[] =
{
  /* SHORT branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgt,
  nullptr, // BoolTest::overflow
  (conditional_branch_insn)&MacroAssembler::blt,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::ble,
  nullptr, // BoolTest::no_overflow
  (conditional_branch_insn)&MacroAssembler::bge,

  /* UNSIGNED branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgtu,
  nullptr,
  (conditional_branch_insn)&MacroAssembler::bltu,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::bleu,
  nullptr,
  (conditional_branch_insn)&MacroAssembler::bgeu
};

static float_conditional_branch_insn float_conditional_branches[] =
{
  /* FLOAT SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::float_beq,
  (float_conditional_branch_insn)&MacroAssembler::float_bgt,
  nullptr, // BoolTest::overflow
  (float_conditional_branch_insn)&MacroAssembler::float_blt,
  (float_conditional_branch_insn)&MacroAssembler::float_bne,
  (float_conditional_branch_insn)&MacroAssembler::float_ble,
  nullptr, // BoolTest::no_overflow
  (float_conditional_branch_insn)&MacroAssembler::float_bge,

  /* DOUBLE SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::double_beq,
  (float_conditional_branch_insn)&MacroAssembler::double_bgt,
  nullptr,
  (float_conditional_branch_insn)&MacroAssembler::double_blt,
  (float_conditional_branch_insn)&MacroAssembler::double_bne,
  (float_conditional_branch_insn)&MacroAssembler::double_ble,
  nullptr,
  (float_conditional_branch_insn)&MacroAssembler::double_bge
};

void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
  assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
         "invalid conditional branch index");
  (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
}

// This function should only be used by C2. For an unordered-greater comparison the
// unordered bit is flipped: C2 uses unordered-lesser instead of unordered-greater
// and finally commutes the result bits in do_one_bytecode().
void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
  assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
         "invalid float conditional branch index");
  int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
  (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
    (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ?
    false : true);
}

void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
  switch (cmpFlag) {
    case BoolTest::eq:
    case BoolTest::le:
      beqz(op1, L, is_far);
      break;
    case BoolTest::ne:
    case BoolTest::gt:
      bnez(op1, L, is_far);
      break;
    default:
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
  switch (cmpFlag) {
    case BoolTest::eq:
      beqz(op1, L, is_far);
      break;
    case BoolTest::ne:
      bnez(op1, L, is_far);
      break;
    default:
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
  Label L;
  cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
  mv(dst, src);
  bind(L);
}

// Set dst to NaN if any NaN input.
void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
                                  bool is_double, bool is_min) {
  assert_different_registers(dst, src1, src2);

  Label Done, Compare;

  is_double ? fclass_d(t0, src1)
            : fclass_s(t0, src1);
  is_double ? fclass_d(t1, src2)
            : fclass_s(t1, src2);
  orr(t0, t0, t1);
  andi(t0, t0, fclass_mask::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
  beqz(t0, Compare);
  is_double ? fadd_d(dst, src1, src2)
            : fadd_s(dst, src1, src2);
  j(Done);

  bind(Compare);
  if (is_double) {
    is_min ? fmin_d(dst, src1, src2)
           : fmax_d(dst, src1, src2);
  } else {
    is_min ? fmin_s(dst, src1, src2)
           : fmax_s(dst, src1, src2);
  }

  bind(Done);
}
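// Rationale note (not from the original sources): RISC-V fmin/fmax return the non-NaN
// operand when exactly one input is NaN, while Java's Math.min/max must return NaN in
// that case, e.g. Math.min(1.0, Double.NaN) returns NaN. Adding the two inputs above
// (fadd_d/fadd_s) produces a NaN whenever at least one input is NaN, which is why the
// NaN case is handled before falling through to fmin/fmax.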
// According to the Java SE specification, for floating-point round operations, if
// the input is NaN, +/-Infinity, or +/-0, the same input is returned as the
// rounded result; this differs from the behavior of the RISC-V fcvt instructions
// (which round out-of-range values to the nearest max or min value), therefore
// special handling is needed for NaN, +/-Infinity and +/-0.
void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
                                          Register tmp1, Register tmp2, Register tmp3) {

  assert_different_registers(dst, src);
  assert_different_registers(tmp1, tmp2, tmp3);

  // Set the rounding mode for the conversions.
  // We use the same rounding mode for both the double->long and the long->double
  // conversion. The mode would only matter for the long->double conversion if the
  // long value were not representable as a double; since the long value is itself
  // the result of a double->long conversion, it is always representable.
  RoundingMode rm;
  switch (round_mode) {
    case RoundDoubleModeNode::rmode_ceil:
      rm = RoundingMode::rup;
      break;
    case RoundDoubleModeNode::rmode_floor:
      rm = RoundingMode::rdn;
      break;
    case RoundDoubleModeNode::rmode_rint:
      rm = RoundingMode::rne;
      break;
    default:
      ShouldNotReachHere();
  }

  // tmp1 - holds the double converted to a long
  // tmp2 - holds the constant used for comparison
  // tmp3 - holds the modified result of the double->long conversion
  Label done, bad_val;

  // Conversion from double to long
  fcvt_l_d(tmp1, src, rm);

  // Generate constant (tmp2)
  // tmp2 = 100...0000
  addi(tmp2, zr, 1);
  slli(tmp2, tmp2, 63);

  // Prepare converted long (tmp1):
  // when the conversion overflows, the result is
  //   tmp1 = 011...1111 or 100...0000
  // convert it to: tmp3 = 100...0000
  addi(tmp3, tmp1, 1);
  andi(tmp3, tmp3, -2);
  beq(tmp3, tmp2, bad_val);

  // Conversion from long to double
  fcvt_d_l(dst, tmp1, rm);
  // Add sign of input value to result for +/- 0 cases
  fsgnj_d(dst, dst, src);
  j(done);

  // If the conversion overflowed, return src.
  bind(bad_val);
  fmv_d(dst, src);

  bind(done);
}

// According to the Java SE specification, for floating-point signum operations, if
// the input is NaN or +/-0.0 it is returned unchanged; otherwise +/-1.0 is returned
// with the sign of the input.
// one - a floating-point 1.0 (provided by the matching rule)
// is_double - specifies whether single or double precision operations are used.
void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
  Label done;

  is_double ? fclass_d(t0, dst)
            : fclass_s(t0, dst);

  // check if input is -0, +0, signaling NaN or quiet NaN
  andi(t0, t0, fclass_mask::zero | fclass_mask::nan);

  bnez(t0, done);

  // use floating-point 1.0 with the sign of the input
  is_double ? fsgnj_d(dst, one, dst)
            : fsgnj_s(dst, one, dst);

  bind(done);
}

static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
#define __ masm.
  FloatRegister dst = stub.data<0>();
  Register src = stub.data<1>();
  Register tmp = stub.data<2>();
  __ bind(stub.entry());

  // The following instructions mainly focus on NaN, as fcvt on RISC-V does not handle
  // NaN well; the code also works for Inf at the same time.

  // Construct a NaN in 32 bits from the NaN in 16 bits;
  // the payloads of non-canonical NaNs need to be preserved.
  __ mv(tmp, 0x7f800000);
  // sign-bit was already set via sign-extension if necessary.
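  // Bit-layout note (added for clarity): src holds the raw float16 bits, sign-extended
  // to XLEN. The left shift by 13 below moves the 10-bit NaN payload (float16 bits 9:0)
  // into the top of the float32 mantissa (bits 22:13) and lets the sign reach bit 31
  // through the sign extension; OR-ing in 0x7f800000 then forces the float32 exponent
  // field to all ones, yielding a NaN (or Inf for a zero payload) with the payload kept.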
  __ slli(t0, src, 13);
  __ orr(tmp, t0, tmp);
  __ fmv_w_x(dst, tmp);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.float16ToFloat
void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
  auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);

  // On riscv, NaN needs a special process as fcvt does not work in that case.
  // On riscv, Inf does not need a special process as fcvt can handle it correctly.
  // However, we let the slow path handle NaN and Inf together, as both are rare cases,
  // and having the slow path handle only the NaN case would sacrifice performance in
  // the normal, i.e. non-NaN and non-Inf, cases.

  // check whether it's a NaN or +/- Inf.
  mv(t0, 0x7c00);
  andr(tmp, src, t0);
  // jump to stub processing NaN and Inf cases.
  beq(t0, tmp, stub->entry());

  // non-NaN or non-Inf cases, just use built-in instructions.
  fmv_h_x(dst, src);
  fcvt_s_h(dst, dst);

  bind(stub->continuation());
}

static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  FloatRegister src = stub.data<1>();
  Register tmp = stub.data<2>();
  __ bind(stub.entry());

  __ fmv_x_w(dst, src);

  // preserve the payloads of non-canonical NaNs.
  __ srai(dst, dst, 13);
  // preserve the sign bit.
  __ srai(tmp, dst, 13);
  __ slli(tmp, tmp, 10);
  __ mv(t0, 0x3ff);
  __ orr(tmp, tmp, t0);

  // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
  __ andr(dst, dst, tmp);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.floatToFloat16
void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
  auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 130, float_to_float16_slow_path);

  // On riscv, NaN needs a special process as fcvt does not work in that case.

  // check whether it's a NaN.
  // replace fclass with feq as a performance optimization.
  feq_s(t0, src, src);
  // jump to stub processing NaN cases.
  beqz(t0, stub->entry());

  // non-NaN cases, just use built-in instructions.
  fcvt_h_s(ftmp, src);
  fmv_x_h(dst, ftmp);

  bind(stub->continuation());
}

static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
#define __ masm.
  VectorRegister dst = stub.data<0>();
  VectorRegister src = stub.data<1>();
  uint vector_length = stub.data<2>();
  __ bind(stub.entry());

  // The following instructions mainly focus on NaN, as vfwcvt_f_f_v on RISC-V does not
  // handle NaN well; the code also works for Inf at the same time.
  //
  // Construct NaNs in 32 bits from the NaNs in 16 bits;
  // the payloads of non-canonical NaNs need to be preserved.

  // adjust vector type to 2 * SEW.
  __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
  // widen and sign-extend src data.
  __ vsext_vf2(dst, src, Assembler::v0_t);
  __ mv(t0, 0x7f800000);
  // sign-bit was already set via sign-extension if necessary.
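  // Same bit construction as the scalar slow path above: shift the payload into place,
  // then OR in the all-ones float32 exponent.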
  __ vsll_vi(dst, dst, 13, Assembler::v0_t);
  __ vor_vx(dst, dst, t0, Assembler::v0_t);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.float16ToFloat
void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
  auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
              (dst, src, vector_length, 24, float16_to_float_v_slow_path);
  assert_different_registers(dst, src);

  // On riscv, NaN needs a special process as vfwcvt_f_f_v does not work in that case.
  // On riscv, Inf does not need a special process as vfwcvt_f_f_v can handle it correctly.
  // However, we let the slow path handle NaN and Inf together, as both are rare cases,
  // and having the slow path handle only the NaN case would sacrifice performance in
  // the normal, i.e. non-NaN and non-Inf, cases.

  vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);

  // check whether there is a NaN or +/- Inf.
  mv(t0, 0x7c00);
  vand_vx(v0, src, t0);
  // v0 will be used as mask in slow path.
  vmseq_vx(v0, v0, t0);
  vcpop_m(t0, v0);

  // For non-NaN or non-Inf cases, just use built-in instructions.
  vfwcvt_f_f_v(dst, src);

  // jump to stub processing NaN and Inf cases if any element in the vector is one of them.
  bnez(t0, stub->entry());

  bind(stub->continuation());
}

static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
                                         C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
#define __ masm.
  VectorRegister dst = stub.data<0>();
  VectorRegister src = stub.data<1>();
  VectorRegister tmp = stub.data<2>();
  __ bind(stub.entry());

  // LMUL is already set to mf2 in float_to_float16_v.

  // preserve the payloads of non-canonical NaNs.
  __ vnsra_wi(dst, src, 13, Assembler::v0_t);

  // preserve the sign bit.
  __ vnsra_wi(tmp, src, 26, Assembler::v0_t);
  __ vsll_vi(tmp, tmp, 10, Assembler::v0_t);
  __ mv(t0, 0x3ff);
  __ vor_vx(tmp, tmp, t0, Assembler::v0_t);

  // get the result by merging sign bit and payloads of preserved non-canonical NaNs.
  __ vand_vv(dst, dst, tmp, Assembler::v0_t);

  __ j(stub.continuation());
#undef __
}

// j.l.Float.floatToFloat16
void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src, VectorRegister vtmp,
                                           Register tmp, uint vector_length) {
  assert_different_registers(dst, src, vtmp);

  auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
              (dst, src, vtmp, 28, float_to_float16_v_slow_path);

  // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.

  vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);

  // check whether there is a NaN.
  // replace vfclass with vmfne_vv as a performance optimization.
  vmfne_vv(v0, src, src);
  vcpop_m(t0, v0);

  vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);

  // For non-NaN cases, just use built-in instructions.
  vfncvt_f_f_w(dst, src);

  // jump to stub processing NaN cases.
  bnez(t0, stub->entry());

  bind(stub->continuation());
}

void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
  vsetvli_helper(bt, vlen);

  // check if input is -0, +0, signaling NaN or quiet NaN
  vfclass_v(v0, dst);
  mv(t0, fclass_mask::zero | fclass_mask::nan);
  vand_vx(v0, v0, t0);
  vmseq_vi(v0, v0, 0);

  // use floating-point 1.0 with the sign of the input
  vfsgnj_vv(dst, one, dst, v0_t);
}

void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
  Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
  // intrinsic is enabled when MaxVectorSize >= 16
  Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
  long len = is_long ? 64 : 32;

  // load the src data (in bits) to be compressed.
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_s_x(v0, src);
  // reset the src data (in bytes) to zero.
  mv(t0, len);
  vsetvli(x0, t0, Assembler::e8, lmul);
  vmv_v_i(v4, 0);
  // convert the src data from bits to bytes.
  vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
  // reset the dst data (in bytes) to zero.
  vmv_v_i(v8, 0);
  // load the mask data (in bits).
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_s_x(v0, mask);
  // compress the src data (in bytes) to dst (in bytes).
  vsetvli(x0, t0, Assembler::e8, lmul);
  vcompress_vm(v8, v4, v0);
  // convert the dst data from bytes to bits.
  vmseq_vi(v0, v8, 1);
  // store result back.
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_x_s(dst, v0);
}

void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
  compress_bits_v(dst, src, mask, /* is_long */ false);
}

void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
  compress_bits_v(dst, src, mask, /* is_long */ true);
}

void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
  Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
  // intrinsic is enabled when MaxVectorSize >= 16
  Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
  long len = is_long ? 64 : 32;

  // load the src data (in bits) to be expanded.
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_s_x(v0, src);
  // reset the src data (in bytes) to zero.
  mv(t0, len);
  vsetvli(x0, t0, Assembler::e8, lmul);
  vmv_v_i(v4, 0);
  // convert the src data from bits to bytes.
  vmerge_vim(v4, v4, 1); // v0 as implicit mask register
  // reset the dst data (in bytes) to zero.
  vmv_v_i(v12, 0);
  // load the mask data (in bits).
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_s_x(v0, mask);
  // expand the src data (in bytes) to dst (in bytes).
  vsetvli(x0, t0, Assembler::e8, lmul);
  viota_m(v8, v0);
  vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
  // convert the dst data from bytes to bits.
  vmseq_vi(v0, v12, 1);
  // store result back.
  vsetivli(x0, 1, sew, Assembler::m1);
  vmv_x_s(dst, v0);
}
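// Worked example (added for clarity, assuming these match Integer.compress/expand
// semantics): with src = 0b1010 and mask = 0b0110, compress_bits selects the src bits
// at the two set mask positions (bit 1 = 1, bit 2 = 0) and packs them into the low
// bits, giving 0b01; expand_bits(0b01, 0b0110) places the low src bits back at the set
// mask positions, giving 0b0010. The routines above do this by expanding the bit
// strings to one byte per bit, using vcompress / viota+vrgather, and converting back.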
void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
  expand_bits_v(dst, src, mask, /* is_long */ false);
}

void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
  expand_bits_v(dst, src, mask, /* is_long */ true);
}

void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
                                        VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
  Label loop;
  Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;

  bind(loop);
  vsetvli(tmp1, cnt, sew, Assembler::m2);
  vlex_v(vr1, a1, sew);
  vlex_v(vr2, a2, sew);
  vmsne_vv(vrs, vr1, vr2);
  vfirst_m(tmp2, vrs);
  bgez(tmp2, DONE);
  sub(cnt, cnt, tmp1);
  if (!islatin) {
    slli(tmp1, tmp1, 1); // get byte counts
  }
  add(a1, a1, tmp1);
  add(a2, a2, tmp1);
  bnez(cnt, loop);

  mv(result, true);
}

void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
  Label DONE;
  Register tmp1 = t0;
  Register tmp2 = t1;

  BLOCK_COMMENT("string_equals_v {");

  mv(result, false);

  element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE);

  bind(DONE);
  BLOCK_COMMENT("} string_equals_v");
}

// used by C2 ClearArray patterns.
// base: Address of a buffer to be zeroed
// cnt: Count in HeapWords
//
// base, cnt, v4, v5, v6, v7 and t0 are clobbered.
void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
  Label loop;

  // making zero words
  vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
  vxor_vv(v4, v4, v4);

  bind(loop);
  vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
  vse64_v(v4, base);
  sub(cnt, cnt, t0);
  shadd(base, t0, base, t0, 3);
  bnez(cnt, loop);
}

void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
                                        Register cnt1, int elem_size) {
  Label DONE;
  Register tmp1 = t0;
  Register tmp2 = t1;
  Register cnt2 = tmp2;
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  BLOCK_COMMENT("arrays_equals_v {");

  // if (a1 == a2), return true
  mv(result, true);
  beq(a1, a2, DONE);

  mv(result, false);
  // if a1 == null or a2 == null, return false
  beqz(a1, DONE);
  beqz(a2, DONE);
  // if (a1.length != a2.length), return false
  lwu(cnt1, Address(a1, length_offset));
  lwu(cnt2, Address(a2, length_offset));
  bne(cnt1, cnt2, DONE);

  la(a1, Address(a1, base_offset));
  la(a2, Address(a2, base_offset));

  element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);

  bind(DONE);

  BLOCK_COMMENT("} arrays_equals_v");
}

void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
                                         Register result, Register tmp1, Register tmp2, int encForm) {
  Label DIFFERENCE, DONE, L, loop;
  bool encLL = encForm == StrIntrinsicNode::LL;
  bool encLU = encForm == StrIntrinsicNode::LU;
  bool encUL = encForm == StrIntrinsicNode::UL;

  bool str1_isL = encLL || encLU;
  bool str2_isL = encLL || encUL;

  int minCharsInWord = encLL ? wordSize : wordSize / 2;

  BLOCK_COMMENT("string_compare {");

  // for Latin strings, 1 byte for 1 character
  // for UTF16 strings, 2 bytes for 1 character
  if (!str1_isL)
    sraiw(cnt1, cnt1, 1);
  if (!str2_isL)
    sraiw(cnt2, cnt2, 1);

  // if the compared characters are all equal, the result is the length difference;
  // save the minimum of the string lengths in cnt2.
  sub(result, cnt1, cnt2);
  bgt(cnt1, cnt2, L);
  mv(cnt2, cnt1);
  bind(L);

  if (str1_isL == str2_isL) { // LL or UU
    element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE);
    j(DONE);
  } else { // LU or UL
    Register strL = encLU ? str1 : str2;
    Register strU = encLU ? str2 : str1;
    VectorRegister vstr1 = encLU ? v8 : v4;
    VectorRegister vstr2 = encLU ? v4 : v8;

    bind(loop);
    vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
    vle8_v(vstr1, strL);
    vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
    vzext_vf2(vstr2, vstr1);
    vle16_v(vstr1, strU);
    vmsne_vv(v4, vstr2, vstr1);
    vfirst_m(tmp2, v4);
    bgez(tmp2, DIFFERENCE);
    sub(cnt2, cnt2, tmp1);
    add(strL, strL, tmp1);
    shadd(strU, tmp1, strU, tmp1, 1);
    bnez(cnt2, loop);
    j(DONE);
  }

  bind(DIFFERENCE);
  slli(tmp1, tmp2, 1);
  add(str1, str1, str1_isL ? tmp2 : tmp1);
  add(str2, str2, str2_isL ? tmp2 : tmp1);
  str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
  str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
  sub(result, tmp1, tmp2);

  bind(DONE);
}

void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
  Label loop;
  assert_different_registers(src, dst, len, tmp, t0);

  BLOCK_COMMENT("byte_array_inflate_v {");
  bind(loop);
  vsetvli(tmp, len, Assembler::e8, Assembler::m2);
  vle8_v(v6, src);
  vsetvli(t0, len, Assembler::e16, Assembler::m4);
  vzext_vf2(v4, v6);
  vse16_v(v4, dst);
  sub(len, len, tmp);
  add(src, src, tmp);
  shadd(dst, tmp, dst, tmp, 1);
  bnez(len, loop);
  BLOCK_COMMENT("} byte_array_inflate_v");
}
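// Example of the post-condition described below (illustrative only): compressing the
// char sequence {'a', 'b', '\u0100', 'c'} stops at the first non-latin1 character and
// yields a result of 2, while a sequence that is entirely latin1 yields its length.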
// Compress char[] array to byte[].
// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
// result: the array length if every element in array can be encoded,
// otherwise, the index of first non-latin1 (> 0xff) character.
void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
                                              Register result, Register tmp) {
  encode_iso_array_v(src, dst, len, result, tmp, false);
}

// Intrinsic for
//
// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
//     return the number of characters copied.
// - java/lang/StringUTF16.compress
//     return index of non-latin1 character if copy fails, otherwise 'len'.
//
// This version always returns the number of characters copied. A successful
// copy will complete with the post-condition: 'res' == 'len', while an
// unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
//
// Clobbers: src, dst, len, result, t0
void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
                                           Register result, Register tmp, bool ascii) {
  Label loop, fail, done;

  BLOCK_COMMENT("encode_iso_array_v {");
  mv(result, 0);

  bind(loop);
  mv(tmp, ascii ? 0x7f : 0xff);
  vsetvli(t0, len, Assembler::e16, Assembler::m2);
  vle16_v(v2, src);

  vmsgtu_vx(v1, v2, tmp);
  vfirst_m(tmp, v1);
  vmsbf_m(v0, v1);
  // compress char to byte
  vsetvli(t0, len, Assembler::e8);
  vncvt_x_x_w(v1, v2, Assembler::v0_t);
  vse8_v(v1, dst, Assembler::v0_t);

  // fail if char > 0x7f/0xff
  bgez(tmp, fail);
  add(result, result, t0);
  add(dst, dst, t0);
  sub(len, len, t0);
  shadd(src, t0, src, t0, 1);
  bnez(len, loop);
  j(done);

  bind(fail);
  add(result, result, tmp);

  bind(done);
  BLOCK_COMMENT("} encode_iso_array_v");
}

void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
  Label LOOP, SET_RESULT, DONE;

  BLOCK_COMMENT("count_positives_v {");
  assert_different_registers(ary, len, result, tmp);

  mv(result, zr);

  bind(LOOP);
  vsetvli(t0, len, Assembler::e8, Assembler::m4);
  vle8_v(v4, ary);
  vmslt_vx(v4, v4, zr);
  vfirst_m(tmp, v4);
  bgez(tmp, SET_RESULT);
  // if tmp == -1, all bytes are positive
  add(result, result, t0);

  sub(len, len, t0);
  add(ary, ary, t0);
  bnez(len, LOOP);
  j(DONE);

  // add remaining positive bytes count
  bind(SET_RESULT);
  add(result, result, tmp);

  bind(DONE);
  BLOCK_COMMENT("} count_positives_v");
}

void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
                                              Register ch, Register result,
                                              Register tmp1, Register tmp2,
                                              bool isL) {
  mv(result, zr);

  Label loop, MATCH, DONE;
  Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
  bind(loop);
  vsetvli(tmp1, cnt1, sew, Assembler::m4);
  vlex_v(v4, str1, sew);
  vmseq_vx(v4, v4, ch);
  vfirst_m(tmp2, v4);
  bgez(tmp2, MATCH); // if equal, return index

  add(result, result, tmp1);
  sub(cnt1, cnt1, tmp1);
  if (!isL) slli(tmp1, tmp1, 1);
  add(str1, str1, tmp1);
  bnez(cnt1, loop);

  mv(result, -1);
  j(DONE);

  bind(MATCH);
  add(result, result, tmp2);

  bind(DONE);
}
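// Note on the vector min/max routines below (added for clarity): as in the scalar
// minmax_fp above, vfmin/vfmax alone would not match Java semantics when exactly one
// input lane is NaN, so after the min/max the lanes where src1 or src2 is NaN are
// overwritten with src + src, which propagates a NaN result for those lanes.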
// Set dst to NaN if any NaN input.
void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
                                    BasicType bt, bool is_min, uint vector_length) {
  assert_different_registers(dst, src1, src2);

  vsetvli_helper(bt, vector_length);

  is_min ? vfmin_vv(dst, src1, src2)
         : vfmax_vv(dst, src1, src2);

  vmfne_vv(v0, src1, src1);
  vfadd_vv(dst, src1, src1, Assembler::v0_t);
  vmfne_vv(v0, src2, src2);
  vfadd_vv(dst, src2, src2, Assembler::v0_t);
}

// Set dst to NaN if any NaN input.
// The destination vector register elements corresponding to masked-off elements
// are handled with a mask-undisturbed policy.
void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
                                           VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
                                           BasicType bt, bool is_min, uint vector_length) {
  assert_different_registers(src1, src2, tmp1, tmp2);
  vsetvli_helper(bt, vector_length);

  // Check vector elements of src1 and src2 for NaN.
  vmfeq_vv(tmp1, src1, src1);
  vmfeq_vv(tmp2, src2, src2);

  vmandn_mm(v0, vmask, tmp1);
  vfadd_vv(dst, src1, src1, Assembler::v0_t);
  vmandn_mm(v0, vmask, tmp2);
  vfadd_vv(dst, src2, src2, Assembler::v0_t);

  vmand_mm(tmp2, tmp1, tmp2);
  vmand_mm(v0, vmask, tmp2);
  is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
         : vfmax_vv(dst, src1, src2, Assembler::v0_t);
}

// Set dst to NaN if any NaN input.
void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
                                           FloatRegister src1, VectorRegister src2,
                                           VectorRegister tmp1, VectorRegister tmp2,
                                           bool is_double, bool is_min, uint vector_length, VectorMask vm) {
  assert_different_registers(dst, src1);
  assert_different_registers(src2, tmp1, tmp2);

  Label L_done, L_NaN_1, L_NaN_2;
  // Set dst to src1 if src1 is NaN
  is_double ? feq_d(t0, src1, src1)
            : feq_s(t0, src1, src1);
  beqz(t0, L_NaN_2);

  vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
  vfmv_s_f(tmp2, src1);

  is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
         : vfredmax_vs(tmp1, src2, tmp2, vm);
  vfmv_f_s(dst, tmp1);

  // Checking NaNs in src2
  vmfne_vv(tmp1, src2, src2, vm);
  vcpop_m(t0, tmp1, vm);
  beqz(t0, L_done);

  bind(L_NaN_1);
  vfredusum_vs(tmp1, src2, tmp2, vm);
  vfmv_f_s(dst, tmp1);
  j(L_done);

  bind(L_NaN_2);
  is_double ? fmv_d(dst, src1)
            : fmv_s(dst, src1);
  bind(L_done);
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}

void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
                                          VectorRegister src2, VectorRegister tmp,
                                          int opc, BasicType bt, uint vector_length, VectorMask vm) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  vsetvli_helper(bt, vector_length);
  vmv_s_x(tmp, src1);
  switch (opc) {
    case Op_AddReductionVI:
    case Op_AddReductionVL:
      vredsum_vs(tmp, src2, tmp, vm);
      break;
    case Op_AndReductionV:
      vredand_vs(tmp, src2, tmp, vm);
      break;
    case Op_OrReductionV:
      vredor_vs(tmp, src2, tmp, vm);
      break;
    case Op_XorReductionV:
      vredxor_vs(tmp, src2, tmp, vm);
      break;
    case Op_MaxReductionV:
      vredmax_vs(tmp, src2, tmp, vm);
      break;
    case Op_MinReductionV:
      vredmin_vs(tmp, src2, tmp, vm);
      break;
    default:
      ShouldNotReachHere();
  }
  vmv_x_s(dst, tmp);
}

// Set vl and vtype for full and partial vector operations.
// (vma = mu, vta = tu, vill = false)
void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
  Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
  if (vector_length <= 31) {
    vsetivli(tmp, vector_length, sew, vlmul);
  } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
    vsetvli(tmp, x0, sew, vlmul);
  } else {
    mv(tmp, vector_length);
    vsetvli(tmp, tmp, sew, vlmul);
  }
}

void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
                                           int cond, BasicType bt, uint vector_length, VectorMask vm) {
  assert(is_integral_type(bt), "unsupported element type");
  assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
  vsetvli_helper(bt, vector_length);
  vmclr_m(vd);
  switch (cond) {
    case BoolTest::eq:  vmseq_vv(vd, src1, src2, vm);  break;
    case BoolTest::ne:  vmsne_vv(vd, src1, src2, vm);  break;
    case BoolTest::le:  vmsle_vv(vd, src1, src2, vm);  break;
    case BoolTest::ge:  vmsge_vv(vd, src1, src2, vm);  break;
    case BoolTest::lt:  vmslt_vv(vd, src1, src2, vm);  break;
    case BoolTest::gt:  vmsgt_vv(vd, src1, src2, vm);  break;
    case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break;
    case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break;
    case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break;
    case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
                                     int cond, BasicType bt, uint vector_length, VectorMask vm) {
  assert(is_floating_point_type(bt), "unsupported element type");
  assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
  vsetvli_helper(bt, vector_length);
  vmclr_m(vd);
  switch (cond) {
    case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
    case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
    case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
    case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
    case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
    case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

// In Matcher::scalable_predicate_reg_slots,
// we assume each predicate register is one-eighth of the size of
// a scalable vector register, i.e. one mask bit per vector byte.
void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) {
  vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
  add(t0, sp, offset);
  vse8_v(v, t0);
}

void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) {
  vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
  add(t0, sp, offset);
  vle8_v(v, t0);
}

void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
                                         VectorRegister src, BasicType src_bt, bool is_signed) {
  assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
  assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
  // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
  // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
  // and the overlap is in the highest-numbered part of the destination register group.
  // Since LMUL=1, vd and vs cannot be the same.
  assert_different_registers(dst, src);

  vsetvli_helper(dst_bt, vector_length);
  if (is_signed) {
    if (src_bt == T_BYTE) {
      switch (dst_bt) {
      case T_SHORT:
        vsext_vf2(dst, src);
        break;
      case T_INT:
        vsext_vf4(dst, src);
        break;
      case T_LONG:
        vsext_vf8(dst, src);
        break;
      default:
        ShouldNotReachHere();
      }
    } else if (src_bt == T_SHORT) {
      if (dst_bt == T_INT) {
        vsext_vf2(dst, src);
      } else {
        vsext_vf4(dst, src);
      }
    } else if (src_bt == T_INT) {
      vsext_vf2(dst, src);
    }
  } else {
    if (src_bt == T_BYTE) {
      switch (dst_bt) {
      case T_SHORT:
        vzext_vf2(dst, src);
        break;
      case T_INT:
        vzext_vf4(dst, src);
        break;
      case T_LONG:
        vzext_vf8(dst, src);
        break;
      default:
        ShouldNotReachHere();
      }
    } else if (src_bt == T_SHORT) {
      if (dst_bt == T_INT) {
        vzext_vf2(dst, src);
      } else {
        vzext_vf4(dst, src);
      }
    } else if (src_bt == T_INT) {
      vzext_vf2(dst, src);
    }
  }
}

// Vector narrow from src to dst with specified element sizes.
// The high part of the dst vector will be filled with zero.
void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
                                         VectorRegister src, BasicType src_bt) {
  assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
  assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
  mv(t0, vector_length);
  if (src_bt == T_LONG) {
    // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
    // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
    // So we can currently only scale down by 1/2 the width at a time.
    vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
    vncvt_x_x_w(dst, src);
    if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
      vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
      vncvt_x_x_w(dst, dst);
      if (dst_bt == T_BYTE) {
        vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
        vncvt_x_x_w(dst, dst);
      }
    }
  } else if (src_bt == T_INT) {
    // T_SHORT
    vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
    vncvt_x_x_w(dst, src);
    if (dst_bt == T_BYTE) {
      vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
      vncvt_x_x_w(dst, dst);
    }
  } else if (src_bt == T_SHORT) {
    vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
    vncvt_x_x_w(dst, src);
  }
}

#define VFCVT_SAFE(VFLOATCVT)                                                      \
void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
  assert_different_registers(dst, src);                                            \
  vxor_vv(dst, dst, dst);                                                          \
  vmfeq_vv(v0, src, src);                                                          \
  VFLOATCVT(dst, src, Assembler::v0_t);                                            \
}

VFCVT_SAFE(vfcvt_rtz_x_f_v);

#undef VFCVT_SAFE

// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
                                  int idx, VectorRegister tmp) {
  assert(is_integral_type(bt), "unsupported element type");
  assert(idx >= 0, "idx cannot be negative");
  // Only need the first element after vector slidedown
  vsetvli_helper(bt, 1);
  if (idx == 0) {
    vmv_x_s(dst, src);
  } else if (idx <= 31) {
    vslidedown_vi(tmp, src, idx);
    vmv_x_s(dst, tmp);
  } else {
    mv(t0, idx);
    vslidedown_vx(tmp, src, t0);
    vmv_x_s(dst, tmp);
  }
}

// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of floating point type.
void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
                                     int idx, VectorRegister tmp) {
  assert(is_floating_point_type(bt), "unsupported element type");
  assert(idx >= 0, "idx cannot be negative");
  // Only need the first element after vector slidedown
  vsetvli_helper(bt, 1);
  if (idx == 0) {
    vfmv_f_s(dst, src);
  } else if (idx <= 31) {
    vslidedown_vi(tmp, src, idx);
    vfmv_f_s(dst, tmp);
  } else {
    mv(t0, idx);
    vslidedown_vx(tmp, src, t0);
    vfmv_f_s(dst, tmp);
  }
}
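// Note (added for clarity): in effect, extract_v/extract_fp_v implement dst = src[idx]:
// a vslidedown by idx brings the requested lane to element 0, and vmv_x_s/vfmv_f_s
// then read element 0 into the scalar destination register.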