/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/assembler.hpp" 28 #include "asm/assembler.inline.hpp" 29 #include "opto/c2_MacroAssembler.hpp" 30 #include "opto/compile.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 36 #ifdef PRODUCT 37 #define BLOCK_COMMENT(str) /* nothing */ 38 #define STOP(error) stop(error) 39 #else 40 #define BLOCK_COMMENT(str) block_comment(str) 41 #define STOP(error) block_comment(error); stop(error) 42 #endif 43 44 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 45 46 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, 47 Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) { 48 // Use cr register to indicate the fast_lock result: zero for success; non-zero for failure. 49 Register flag = t1; 50 Register oop = objectReg; 51 Register box = boxReg; 52 Register disp_hdr = tmp1Reg; 53 Register tmp = tmp2Reg; 54 Label cont; 55 Label object_has_monitor; 56 Label count, no_count; 57 58 assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0); 59 60 // Load markWord from object into displaced_header. 61 ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 62 63 if (DiagnoseSyncOnValueBasedClasses != 0) { 64 load_klass(flag, oop); 65 lwu(flag, Address(flag, Klass::access_flags_offset())); 66 test_bit(flag, flag, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 67 bnez(flag, cont, true /* is_far */); 68 } 69 70 // Check for existing monitor 71 test_bit(t0, disp_hdr, exact_log2(markWord::monitor_value)); 72 bnez(t0, object_has_monitor); 73 74 if (LockingMode == LM_MONITOR) { 75 mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path 76 j(cont); 77 } else if (LockingMode == LM_LEGACY) { 78 // Set tmp to be (markWord of object | UNLOCK_VALUE). 79 ori(tmp, disp_hdr, markWord::unlocked_value); 80 81 // Initialize the box. (Must happen before we update the object mark!) 
82 sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 83 84 // Compare object markWord with an unlocked value (tmp) and if 85 // equal exchange the stack address of our box with object markWord. 86 // On failure disp_hdr contains the possibly locked markWord. 87 cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64, Assembler::aq, 88 Assembler::rl, /*result*/disp_hdr); 89 mv(flag, zr); 90 beq(disp_hdr, tmp, cont); // prepare zero flag and goto cont if we won the cas 91 92 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 93 94 // If the compare-and-exchange succeeded, then we found an unlocked 95 // object, will have now locked it will continue at label cont 96 // We did not see an unlocked object so try the fast recursive case. 97 98 // Check if the owner is self by comparing the value in the 99 // markWord of object (disp_hdr) with the stack pointer. 100 sub(disp_hdr, disp_hdr, sp); 101 mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place)); 102 // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto cont, 103 // hence we can store 0 as the displaced header in the box, which indicates that it is a 104 // recursive lock. 105 andr(tmp/*==0?*/, disp_hdr, tmp); 106 sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes())); 107 mv(flag, tmp); // we can use the value of tmp as the result here 108 j(cont); 109 } else { 110 assert(LockingMode == LM_LIGHTWEIGHT, ""); 111 Label slow; 112 lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, slow); 113 114 // Indicate success on completion. 115 mv(flag, zr); 116 j(count); 117 bind(slow); 118 mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path 119 j(no_count); 120 } 121 122 // Handle existing monitor. 
123 bind(object_has_monitor); 124 // The object's monitor m is unlocked iff m->owner == NULL, 125 // otherwise m->owner may contain a thread or a stack address. 126 // 127 // Try to CAS m->owner from NULL to current thread. 128 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value)); 129 cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64, Assembler::aq, 130 Assembler::rl, /*result*/flag); // cas succeeds if flag == zr(expected) 131 132 if (LockingMode != LM_LIGHTWEIGHT) { 133 // Store a non-null value into the box to avoid looking like a re-entrant 134 // lock. The fast-path monitor unlock code checks for 135 // markWord::monitor_value so use markWord::unused_mark which has the 136 // relevant bit set, and also matches ObjectSynchronizer::slow_enter. 137 mv(tmp, (address)markWord::unused_mark().value()); 138 sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 139 } 140 141 beqz(flag, cont); // CAS success means locking succeeded 142 143 bne(flag, xthread, cont); // Check for recursive locking 144 145 // Recursive lock case 146 mv(flag, zr); 147 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, t0, tmp); 148 149 bind(cont); 150 // zero flag indicates success 151 // non-zero flag indicates failure 152 bnez(flag, no_count); 153 154 bind(count); 155 increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp); 156 157 bind(no_count); 158 } 159 160 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, 161 Register tmp1Reg, Register tmp2Reg) { 162 // Use cr register to indicate the fast_unlock result: zero for success; non-zero for failure. 
163 Register flag = t1; 164 Register oop = objectReg; 165 Register box = boxReg; 166 Register disp_hdr = tmp1Reg; 167 Register tmp = tmp2Reg; 168 Label cont; 169 Label object_has_monitor; 170 Label count, no_count; 171 172 assert_different_registers(oop, box, tmp, disp_hdr, flag, t0); 173 174 if (LockingMode == LM_LEGACY) { 175 // Find the lock address and load the displaced header from the stack. 176 ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes())); 177 178 // If the displaced header is 0, we have a recursive unlock. 179 mv(flag, disp_hdr); 180 beqz(disp_hdr, cont); 181 } 182 183 // Handle existing monitor. 184 ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes())); 185 test_bit(t0, tmp, exact_log2(markWord::monitor_value)); 186 bnez(t0, object_has_monitor); 187 188 if (LockingMode == LM_MONITOR) { 189 mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path 190 j(cont); 191 } else if (LockingMode == LM_LEGACY) { 192 // Check if it is still a light weight lock, this is true if we 193 // see the stack address of the basicLock in the markWord of the 194 // object. 195 196 cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64, Assembler::relaxed, 197 Assembler::rl, /*result*/tmp); 198 xorr(flag, box, tmp); // box == tmp if cas succeeds 199 j(cont); 200 } else { 201 assert(LockingMode == LM_LIGHTWEIGHT, ""); 202 Label slow; 203 lightweight_unlock(oop, tmp, box, disp_hdr, slow); 204 205 // Indicate success on completion. 206 mv(flag, zr); 207 j(count); 208 bind(slow); 209 mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path 210 j(no_count); 211 } 212 213 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 214 215 // Handle existing monitor. 
216 bind(object_has_monitor); 217 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 218 add(tmp, tmp, -(int)markWord::monitor_value); // monitor 219 220 if (LockingMode == LM_LIGHTWEIGHT) { 221 // If the owner is anonymous, we need to fix it -- in an outline stub. 222 Register tmp2 = disp_hdr; 223 ld(tmp2, Address(tmp, ObjectMonitor::owner_offset())); 224 test_bit(t0, tmp2, exact_log2(ObjectMonitor::ANONYMOUS_OWNER)); 225 C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2); 226 Compile::current()->output()->add_stub(stub); 227 bnez(t0, stub->entry(), /* is_far */ true); 228 bind(stub->continuation()); 229 } 230 231 ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 232 233 Label notRecursive; 234 beqz(disp_hdr, notRecursive); // Will be 0 if not recursive. 235 236 // Recursive lock 237 addi(disp_hdr, disp_hdr, -1); 238 sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 239 mv(flag, zr); 240 j(cont); 241 242 bind(notRecursive); 243 ld(flag, Address(tmp, ObjectMonitor::EntryList_offset())); 244 ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset())); 245 orr(flag, flag, disp_hdr); // Will be 0 if both are 0. 
246 bnez(flag, cont); 247 // need a release store here 248 la(tmp, Address(tmp, ObjectMonitor::owner_offset())); 249 membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore); 250 sd(zr, Address(tmp)); // set unowned 251 252 bind(cont); 253 // zero flag indicates success 254 // non-zero flag indicates failure 255 bnez(flag, no_count); 256 257 bind(count); 258 decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp); 259 260 bind(no_count); 261 } 262 263 // short string 264 // StringUTF16.indexOfChar 265 // StringLatin1.indexOfChar 266 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1, 267 Register ch, Register result, 268 bool isL) 269 { 270 Register ch1 = t0; 271 Register index = t1; 272 273 BLOCK_COMMENT("string_indexof_char_short {"); 274 275 Label LOOP, LOOP1, LOOP4, LOOP8; 276 Label MATCH, MATCH1, MATCH2, MATCH3, 277 MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH; 278 279 mv(result, -1); 280 mv(index, zr); 281 282 bind(LOOP); 283 addi(t0, index, 8); 284 ble(t0, cnt1, LOOP8); 285 addi(t0, index, 4); 286 ble(t0, cnt1, LOOP4); 287 j(LOOP1); 288 289 bind(LOOP8); 290 isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0)); 291 beq(ch, ch1, MATCH); 292 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2)); 293 beq(ch, ch1, MATCH1); 294 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4)); 295 beq(ch, ch1, MATCH2); 296 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6)); 297 beq(ch, ch1, MATCH3); 298 isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8)); 299 beq(ch, ch1, MATCH4); 300 isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10)); 301 beq(ch, ch1, MATCH5); 302 isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12)); 303 beq(ch, ch1, MATCH6); 304 isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14)); 305 beq(ch, ch1, MATCH7); 306 addi(index, index, 8); 307 addi(str1, str1, isL ? 
8 : 16); 308 blt(index, cnt1, LOOP); 309 j(NOMATCH); 310 311 bind(LOOP4); 312 isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0)); 313 beq(ch, ch1, MATCH); 314 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2)); 315 beq(ch, ch1, MATCH1); 316 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4)); 317 beq(ch, ch1, MATCH2); 318 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6)); 319 beq(ch, ch1, MATCH3); 320 addi(index, index, 4); 321 addi(str1, str1, isL ? 4 : 8); 322 bge(index, cnt1, NOMATCH); 323 324 bind(LOOP1); 325 isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1)); 326 beq(ch, ch1, MATCH); 327 addi(index, index, 1); 328 addi(str1, str1, isL ? 1 : 2); 329 blt(index, cnt1, LOOP1); 330 j(NOMATCH); 331 332 bind(MATCH1); 333 addi(index, index, 1); 334 j(MATCH); 335 336 bind(MATCH2); 337 addi(index, index, 2); 338 j(MATCH); 339 340 bind(MATCH3); 341 addi(index, index, 3); 342 j(MATCH); 343 344 bind(MATCH4); 345 addi(index, index, 4); 346 j(MATCH); 347 348 bind(MATCH5); 349 addi(index, index, 5); 350 j(MATCH); 351 352 bind(MATCH6); 353 addi(index, index, 6); 354 j(MATCH); 355 356 bind(MATCH7); 357 addi(index, index, 7); 358 359 bind(MATCH); 360 mv(result, index); 361 bind(NOMATCH); 362 BLOCK_COMMENT("} string_indexof_char_short"); 363 } 364 365 // StringUTF16.indexOfChar 366 // StringLatin1.indexOfChar 367 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 368 Register ch, Register result, 369 Register tmp1, Register tmp2, 370 Register tmp3, Register tmp4, 371 bool isL) 372 { 373 Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG; 374 Register ch1 = t0; 375 Register orig_cnt = t1; 376 Register mask1 = tmp3; 377 Register mask2 = tmp2; 378 Register match_mask = tmp1; 379 Register trailing_char = tmp4; 380 Register unaligned_elems = tmp4; 381 382 BLOCK_COMMENT("string_indexof_char {"); 383 beqz(cnt1, NOMATCH); 384 385 addi(t0, cnt1, isL ? 
-32 : -16); 386 bgtz(t0, DO_LONG); 387 string_indexof_char_short(str1, cnt1, ch, result, isL); 388 j(DONE); 389 390 bind(DO_LONG); 391 mv(orig_cnt, cnt1); 392 if (AvoidUnalignedAccesses) { 393 Label ALIGNED; 394 andi(unaligned_elems, str1, 0x7); 395 beqz(unaligned_elems, ALIGNED); 396 sub(unaligned_elems, unaligned_elems, 8); 397 neg(unaligned_elems, unaligned_elems); 398 if (!isL) { 399 srli(unaligned_elems, unaligned_elems, 1); 400 } 401 // do unaligned part per element 402 string_indexof_char_short(str1, unaligned_elems, ch, result, isL); 403 bgez(result, DONE); 404 mv(orig_cnt, cnt1); 405 sub(cnt1, cnt1, unaligned_elems); 406 bind(ALIGNED); 407 } 408 409 // duplicate ch 410 if (isL) { 411 slli(ch1, ch, 8); 412 orr(ch, ch1, ch); 413 } 414 slli(ch1, ch, 16); 415 orr(ch, ch1, ch); 416 slli(ch1, ch, 32); 417 orr(ch, ch1, ch); 418 419 if (!isL) { 420 slli(cnt1, cnt1, 1); 421 } 422 423 uint64_t mask0101 = UCONST64(0x0101010101010101); 424 uint64_t mask0001 = UCONST64(0x0001000100010001); 425 mv(mask1, isL ? mask0101 : mask0001); 426 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 427 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 428 mv(mask2, isL ? 
mask7f7f : mask7fff); 429 430 bind(CH1_LOOP); 431 ld(ch1, Address(str1)); 432 addi(str1, str1, 8); 433 addi(cnt1, cnt1, -8); 434 compute_match_mask(ch1, ch, match_mask, mask1, mask2); 435 bnez(match_mask, HIT); 436 bgtz(cnt1, CH1_LOOP); 437 j(NOMATCH); 438 439 bind(HIT); 440 ctzc_bit(trailing_char, match_mask, isL, ch1, result); 441 srli(trailing_char, trailing_char, 3); 442 addi(cnt1, cnt1, 8); 443 ble(cnt1, trailing_char, NOMATCH); 444 // match case 445 if (!isL) { 446 srli(cnt1, cnt1, 1); 447 srli(trailing_char, trailing_char, 1); 448 } 449 450 sub(result, orig_cnt, cnt1); 451 add(result, result, trailing_char); 452 j(DONE); 453 454 bind(NOMATCH); 455 mv(result, -1); 456 457 bind(DONE); 458 BLOCK_COMMENT("} string_indexof_char"); 459 } 460 461 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp); 462 463 // Search for needle in haystack and return index or -1 464 // x10: result 465 // x11: haystack 466 // x12: haystack_len 467 // x13: needle 468 // x14: needle_len 469 void C2_MacroAssembler::string_indexof(Register haystack, Register needle, 470 Register haystack_len, Register needle_len, 471 Register tmp1, Register tmp2, 472 Register tmp3, Register tmp4, 473 Register tmp5, Register tmp6, 474 Register result, int ae) 475 { 476 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 477 478 Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH; 479 480 Register ch1 = t0; 481 Register ch2 = t1; 482 Register nlen_tmp = tmp1; // needle len tmp 483 Register hlen_tmp = tmp2; // haystack len tmp 484 Register result_tmp = tmp4; 485 486 bool isLL = ae == StrIntrinsicNode::LL; 487 488 bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 489 bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 490 int needle_chr_shift = needle_isL ? 0 : 1; 491 int haystack_chr_shift = haystack_isL ? 0 : 1; 492 int needle_chr_size = needle_isL ? 1 : 2; 493 int haystack_chr_size = haystack_isL ? 
1 : 2; 494 load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu : 495 (load_chr_insn)&MacroAssembler::lhu; 496 load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu : 497 (load_chr_insn)&MacroAssembler::lhu; 498 499 BLOCK_COMMENT("string_indexof {"); 500 501 // Note, inline_string_indexOf() generates checks: 502 // if (pattern.count > src.count) return -1; 503 // if (pattern.count == 0) return 0; 504 505 // We have two strings, a source string in haystack, haystack_len and a pattern string 506 // in needle, needle_len. Find the first occurrence of pattern in source or return -1. 507 508 // For larger pattern and source we use a simplified Boyer Moore algorithm. 509 // With a small pattern and source we use linear scan. 510 511 // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm. 512 sub(result_tmp, haystack_len, needle_len); 513 // needle_len < 8, use linear scan 514 sub(t0, needle_len, 8); 515 bltz(t0, LINEARSEARCH); 516 // needle_len >= 256, use linear scan 517 sub(t0, needle_len, 256); 518 bgez(t0, LINEARSTUB); 519 // needle_len >= haystack_len/4, use linear scan 520 srli(t0, haystack_len, 2); 521 bge(needle_len, t0, LINEARSTUB); 522 523 // Boyer-Moore-Horspool introduction: 524 // The Boyer Moore alogorithm is based on the description here:- 525 // 526 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 527 // 528 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 529 // and the 'Good Suffix' rule. 530 // 531 // These rules are essentially heuristics for how far we can shift the 532 // pattern along the search string. 533 // 534 // The implementation here uses the 'Bad Character' rule only because of the 535 // complexity of initialisation for the 'Good Suffix' rule. 
536 // 537 // This is also known as the Boyer-Moore-Horspool algorithm: 538 // 539 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 540 // 541 // #define ASIZE 256 542 // 543 // int bm(unsigned char *pattern, int m, unsigned char *src, int n) { 544 // int i, j; 545 // unsigned c; 546 // unsigned char bc[ASIZE]; 547 // 548 // /* Preprocessing */ 549 // for (i = 0; i < ASIZE; ++i) 550 // bc[i] = m; 551 // for (i = 0; i < m - 1; ) { 552 // c = pattern[i]; 553 // ++i; 554 // // c < 256 for Latin1 string, so, no need for branch 555 // #ifdef PATTERN_STRING_IS_LATIN1 556 // bc[c] = m - i; 557 // #else 558 // if (c < ASIZE) bc[c] = m - i; 559 // #endif 560 // } 561 // 562 // /* Searching */ 563 // j = 0; 564 // while (j <= n - m) { 565 // c = src[i+j]; 566 // if (pattern[m-1] == c) 567 // int k; 568 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k); 569 // if (k < 0) return j; 570 // // c < 256 for Latin1 string, so, no need for branch 571 // #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1 572 // // LL case: (c< 256) always true. Remove branch 573 // j += bc[pattern[j+m-1]]; 574 // #endif 575 // #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF 576 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 577 // if (c < ASIZE) 578 // j += bc[pattern[j+m-1]]; 579 // else 580 // j += 1 581 // #endif 582 // #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1 583 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 
584 // if (c < ASIZE) 585 // j += bc[pattern[j+m-1]]; 586 // else 587 // j += m 588 // #endif 589 // } 590 // return -1; 591 // } 592 593 // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result 594 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 595 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 596 597 Register haystack_end = haystack_len; 598 Register skipch = tmp2; 599 600 // pattern length is >=8, so, we can read at least 1 register for cases when 601 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 602 // UL case. We'll re-read last character in inner pre-loop code to have 603 // single outer pre-loop load 604 const int firstStep = isLL ? 7 : 3; 605 606 const int ASIZE = 256; 607 const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd) 608 609 sub(sp, sp, ASIZE); 610 611 // init BC offset table with default value: needle_len 612 slli(t0, needle_len, 8); 613 orr(t0, t0, needle_len); // [63...16][needle_len][needle_len] 614 slli(tmp1, t0, 16); 615 orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len] 616 slli(tmp1, t0, 32); 617 orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len] 618 619 mv(ch1, sp); // ch1 is t0 620 mv(tmp6, ASIZE / STORE_BYTES); // loop iterations 621 622 bind(BM_INIT_LOOP); 623 // for (i = 0; i < ASIZE; ++i) 624 // bc[i] = m; 625 for (int i = 0; i < 4; i++) { 626 sd(tmp5, Address(ch1, i * wordSize)); 627 } 628 add(ch1, ch1, 32); 629 sub(tmp6, tmp6, 4); 630 bgtz(tmp6, BM_INIT_LOOP); 631 632 sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern 633 Register orig_haystack = tmp5; 634 mv(orig_haystack, haystack); 635 // result_tmp = tmp4 636 shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift); 637 sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1 638 mv(tmp3, needle); 639 640 // for (i = 0; i < m - 1; ) { 641 // c = pattern[i]; 642 // ++i; 643 // // c < 256 for Latin1 string, 
so, no need for branch 644 // #ifdef PATTERN_STRING_IS_LATIN1 645 // bc[c] = m - i; 646 // #else 647 // if (c < ASIZE) bc[c] = m - i; 648 // #endif 649 // } 650 bind(BCLOOP); 651 (this->*needle_load_1chr)(ch1, Address(tmp3), noreg); 652 add(tmp3, tmp3, needle_chr_size); 653 if (!needle_isL) { 654 // ae == StrIntrinsicNode::UU 655 mv(tmp6, ASIZE); 656 bgeu(ch1, tmp6, BCSKIP); 657 } 658 add(tmp4, sp, ch1); 659 sb(ch2, Address(tmp4)); // store skip offset to BC offset table 660 661 bind(BCSKIP); 662 sub(ch2, ch2, 1); // for next pattern element, skip distance -1 663 bgtz(ch2, BCLOOP); 664 665 // tmp6: pattern end, address after needle 666 shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift); 667 if (needle_isL == haystack_isL) { 668 // load last 8 bytes (8LL/4UU symbols) 669 ld(tmp6, Address(tmp6, -wordSize)); 670 } else { 671 // UL: from UTF-16(source) search Latin1(pattern) 672 lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols) 673 // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d 674 // We'll have to wait until load completed, but it's still faster than per-character loads+checks 675 srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a 676 slli(ch2, tmp6, XLEN - 24); 677 srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b 678 slli(ch1, tmp6, XLEN - 16); 679 srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c 680 andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d 681 slli(ch2, ch2, 16); 682 orr(ch2, ch2, ch1); // 0x00000b0c 683 slli(result, tmp3, 48); // use result as temp register 684 orr(tmp6, tmp6, result); // 0x0a00000d 685 slli(result, ch2, 16); 686 orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d 687 } 688 689 // i = m - 1; 690 // skipch = j + i; 691 // if (skipch == pattern[m - 1] 692 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k); 693 // else 694 // move j with bad char offset table 695 bind(BMLOOPSTR2); 696 // compare pattern to source string backward 697 shadd(result, 
nlen_tmp, haystack, result, haystack_chr_shift); 698 (this->*haystack_load_1chr)(skipch, Address(result), noreg); 699 sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8 700 if (needle_isL == haystack_isL) { 701 // re-init tmp3. It's for free because it's executed in parallel with 702 // load above. Alternative is to initialize it before loop, but it'll 703 // affect performance on in-order systems with 2 or more ld/st pipelines 704 srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1] 705 } 706 if (!isLL) { // UU/UL case 707 slli(ch2, nlen_tmp, 1); // offsets in bytes 708 } 709 bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char 710 add(result, haystack, isLL ? nlen_tmp : ch2); 711 // load 8 bytes from source string 712 // if isLL is false then read granularity can be 2 713 load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway 714 mv(ch1, tmp6); 715 if (isLL) { 716 j(BMLOOPSTR1_AFTER_LOAD); 717 } else { 718 sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 719 j(BMLOOPSTR1_CMP); 720 } 721 722 bind(BMLOOPSTR1); 723 shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift); 724 (this->*needle_load_1chr)(ch1, Address(ch1), noreg); 725 shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift); 726 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 727 728 bind(BMLOOPSTR1_AFTER_LOAD); 729 sub(nlen_tmp, nlen_tmp, 1); 730 bltz(nlen_tmp, BMLOOPSTR1_LASTCMP); 731 732 bind(BMLOOPSTR1_CMP); 733 beq(ch1, ch2, BMLOOPSTR1); 734 735 bind(BMSKIP); 736 if (!isLL) { 737 // if we've met UTF symbol while searching Latin1 pattern, then we can 738 // skip needle_len symbols 739 if (needle_isL != haystack_isL) { 740 mv(result_tmp, needle_len); 741 } else { 742 mv(result_tmp, 1); 743 } 744 mv(t0, ASIZE); 745 bgeu(skipch, t0, BMADV); 746 } 747 add(result_tmp, sp, skipch); 748 lbu(result_tmp, Address(result_tmp)); // load skip offset 749 750 bind(BMADV); 751 sub(nlen_tmp, needle_len, 1); 752 // move haystack after bad char skip offset 753 shadd(haystack, result_tmp, haystack, result, haystack_chr_shift); 754 ble(haystack, haystack_end, BMLOOPSTR2); 755 add(sp, sp, ASIZE); 756 j(NOMATCH); 757 758 bind(BMLOOPSTR1_LASTCMP); 759 bne(ch1, ch2, BMSKIP); 760 761 bind(BMMATCH); 762 sub(result, haystack, orig_haystack); 763 if (!haystack_isL) { 764 srli(result, result, 1); 765 } 766 add(sp, sp, ASIZE); 767 j(DONE); 768 769 bind(LINEARSTUB); 770 sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm 771 bltz(t0, LINEARSEARCH); 772 mv(result, zr); 773 RuntimeAddress stub = nullptr; 774 if (isLL) { 775 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll()); 776 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 777 } else if (needle_isL) { 778 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul()); 779 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 780 } else { 781 stub = 
RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu()); 782 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 783 } 784 address call = trampoline_call(stub); 785 if (call == nullptr) { 786 DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH)); 787 ciEnv::current()->record_failure("CodeCache is full"); 788 return; 789 } 790 j(DONE); 791 792 bind(NOMATCH); 793 mv(result, -1); 794 j(DONE); 795 796 bind(LINEARSEARCH); 797 string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae); 798 799 bind(DONE); 800 BLOCK_COMMENT("} string_indexof"); 801 } 802 803 // string_indexof 804 // result: x10 805 // src: x11 806 // src_count: x12 807 // pattern: x13 808 // pattern_count: x14 or 1/2/3/4 809 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle, 810 Register haystack_len, Register needle_len, 811 Register tmp1, Register tmp2, 812 Register tmp3, Register tmp4, 813 int needle_con_cnt, Register result, int ae) 814 { 815 // Note: 816 // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant 817 // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1 818 assert(needle_con_cnt <= 4, "Invalid needle constant count"); 819 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 820 821 Register ch1 = t0; 822 Register ch2 = t1; 823 Register hlen_neg = haystack_len, nlen_neg = needle_len; 824 Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4; 825 826 bool isLL = ae == StrIntrinsicNode::LL; 827 828 bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 829 bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 830 int needle_chr_shift = needle_isL ? 0 : 1; 831 int haystack_chr_shift = haystack_isL ? 0 : 1; 832 int needle_chr_size = needle_isL ? 1 : 2; 833 int haystack_chr_size = haystack_isL ? 1 : 2; 834 835 load_chr_insn needle_load_1chr = needle_isL ? 
(load_chr_insn)&MacroAssembler::lbu : 836 (load_chr_insn)&MacroAssembler::lhu; 837 load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu : 838 (load_chr_insn)&MacroAssembler::lhu; 839 load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu; 840 load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld; 841 842 Label DO1, DO2, DO3, MATCH, NOMATCH, DONE; 843 844 Register first = tmp3; 845 846 if (needle_con_cnt == -1) { 847 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 848 849 sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2); 850 bltz(t0, DOSHORT); 851 852 (this->*needle_load_1chr)(first, Address(needle), noreg); 853 slli(t0, needle_len, needle_chr_shift); 854 add(needle, needle, t0); 855 neg(nlen_neg, t0); 856 slli(t0, result_tmp, haystack_chr_shift); 857 add(haystack, haystack, t0); 858 neg(hlen_neg, t0); 859 860 bind(FIRST_LOOP); 861 add(t0, haystack, hlen_neg); 862 (this->*haystack_load_1chr)(ch2, Address(t0), noreg); 863 beq(first, ch2, STR1_LOOP); 864 865 bind(STR2_NEXT); 866 add(hlen_neg, hlen_neg, haystack_chr_size); 867 blez(hlen_neg, FIRST_LOOP); 868 j(NOMATCH); 869 870 bind(STR1_LOOP); 871 add(nlen_tmp, nlen_neg, needle_chr_size); 872 add(hlen_tmp, hlen_neg, haystack_chr_size); 873 bgez(nlen_tmp, MATCH); 874 875 bind(STR1_NEXT); 876 add(ch1, needle, nlen_tmp); 877 (this->*needle_load_1chr)(ch1, Address(ch1), noreg); 878 add(ch2, haystack, hlen_tmp); 879 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 880 bne(ch1, ch2, STR2_NEXT); 881 add(nlen_tmp, nlen_tmp, needle_chr_size); 882 add(hlen_tmp, hlen_tmp, haystack_chr_size); 883 bltz(nlen_tmp, STR1_NEXT); 884 j(MATCH); 885 886 bind(DOSHORT); 887 if (needle_isL == haystack_isL) { 888 sub(t0, needle_len, 2); 889 bltz(t0, DO1); 890 bgtz(t0, DO3); 891 } 892 } 893 894 if (needle_con_cnt == 4) { 895 Label CH1_LOOP; 896 (this->*load_4chr)(ch1, Address(needle), 
noreg); 897 sub(result_tmp, haystack_len, 4); 898 slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp 899 add(haystack, haystack, tmp3); 900 neg(hlen_neg, tmp3); 901 if (AvoidUnalignedAccesses) { 902 // preload first value, then we will read by 1 character per loop, instead of four 903 // just shifting previous ch2 right by size of character in bits 904 add(tmp3, haystack, hlen_neg); 905 (this->*load_4chr)(ch2, Address(tmp3), noreg); 906 if (isLL) { 907 // need to erase 1 most significant byte in 32-bit value of ch2 908 slli(ch2, ch2, 40); 909 srli(ch2, ch2, 32); 910 } else { 911 slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation 912 } 913 } 914 915 bind(CH1_LOOP); 916 add(tmp3, haystack, hlen_neg); 917 if (AvoidUnalignedAccesses) { 918 srli(ch2, ch2, isLL ? 8 : 16); 919 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg); 920 slli(tmp3, tmp3, isLL ? 24 : 48); 921 add(ch2, ch2, tmp3); 922 } else { 923 (this->*load_4chr)(ch2, Address(tmp3), noreg); 924 } 925 beq(ch1, ch2, MATCH); 926 add(hlen_neg, hlen_neg, haystack_chr_size); 927 blez(hlen_neg, CH1_LOOP); 928 j(NOMATCH); 929 } 930 931 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) { 932 Label CH1_LOOP; 933 BLOCK_COMMENT("string_indexof DO2 {"); 934 bind(DO2); 935 (this->*load_2chr)(ch1, Address(needle), noreg); 936 if (needle_con_cnt == 2) { 937 sub(result_tmp, haystack_len, 2); 938 } 939 slli(tmp3, result_tmp, haystack_chr_shift); 940 add(haystack, haystack, tmp3); 941 neg(hlen_neg, tmp3); 942 if (AvoidUnalignedAccesses) { 943 // preload first value, then we will read by 1 character per loop, instead of two 944 // just shifting previous ch2 right by size of character in bits 945 add(tmp3, haystack, hlen_neg); 946 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg); 947 slli(ch2, ch2, isLL ? 8 : 16); 948 } 949 bind(CH1_LOOP); 950 add(tmp3, haystack, hlen_neg); 951 if (AvoidUnalignedAccesses) { 952 srli(ch2, ch2, isLL ? 
8 : 16); 953 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg); 954 slli(tmp3, tmp3, isLL ? 8 : 16); 955 add(ch2, ch2, tmp3); 956 } else { 957 (this->*load_2chr)(ch2, Address(tmp3), noreg); 958 } 959 beq(ch1, ch2, MATCH); 960 add(hlen_neg, hlen_neg, haystack_chr_size); 961 blez(hlen_neg, CH1_LOOP); 962 j(NOMATCH); 963 BLOCK_COMMENT("} string_indexof DO2"); 964 } 965 966 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) { 967 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 968 BLOCK_COMMENT("string_indexof DO3 {"); 969 970 bind(DO3); 971 (this->*load_2chr)(first, Address(needle), noreg); 972 (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg); 973 if (needle_con_cnt == 3) { 974 sub(result_tmp, haystack_len, 3); 975 } 976 slli(hlen_tmp, result_tmp, haystack_chr_shift); 977 add(haystack, haystack, hlen_tmp); 978 neg(hlen_neg, hlen_tmp); 979 980 bind(FIRST_LOOP); 981 add(ch2, haystack, hlen_neg); 982 if (AvoidUnalignedAccesses) { 983 (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2 984 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 985 slli(tmp2, tmp2, isLL ? 
8 : 16); 986 add(ch2, ch2, tmp2); 987 } else { 988 (this->*load_2chr)(ch2, Address(ch2), noreg); 989 } 990 beq(first, ch2, STR1_LOOP); 991 992 bind(STR2_NEXT); 993 add(hlen_neg, hlen_neg, haystack_chr_size); 994 blez(hlen_neg, FIRST_LOOP); 995 j(NOMATCH); 996 997 bind(STR1_LOOP); 998 add(hlen_tmp, hlen_neg, 2 * haystack_chr_size); 999 add(ch2, haystack, hlen_tmp); 1000 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg); 1001 bne(ch1, ch2, STR2_NEXT); 1002 j(MATCH); 1003 BLOCK_COMMENT("} string_indexof DO3"); 1004 } 1005 1006 if (needle_con_cnt == -1 || needle_con_cnt == 1) { 1007 Label DO1_LOOP; 1008 1009 BLOCK_COMMENT("string_indexof DO1 {"); 1010 bind(DO1); 1011 (this->*needle_load_1chr)(ch1, Address(needle), noreg); 1012 sub(result_tmp, haystack_len, 1); 1013 slli(tmp3, result_tmp, haystack_chr_shift); 1014 add(haystack, haystack, tmp3); 1015 neg(hlen_neg, tmp3); 1016 1017 bind(DO1_LOOP); 1018 add(tmp3, haystack, hlen_neg); 1019 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg); 1020 beq(ch1, ch2, MATCH); 1021 add(hlen_neg, hlen_neg, haystack_chr_size); 1022 blez(hlen_neg, DO1_LOOP); 1023 BLOCK_COMMENT("} string_indexof DO1"); 1024 } 1025 1026 bind(NOMATCH); 1027 mv(result, -1); 1028 j(DONE); 1029 1030 bind(MATCH); 1031 srai(t0, hlen_neg, haystack_chr_shift); 1032 add(result, result_tmp, t0); 1033 1034 bind(DONE); 1035 } 1036 1037 // Compare strings. 
// Compare two strings (scalar version) and return the difference of the first
// unequal characters in 'result' (negative/zero/positive, Java String.compareTo
// convention), or the length difference when one string is a prefix of the other.
//
// 'ae' selects the encoding combination (StrIntrinsicNode::LL/UU/LU/UL).
// Long strings (>= STUB_THRESHOLD characters) are handed off to the
// compare_long_string_* stub routines.
//
// NOTE(review): cnt1, cnt2, tmp1-tmp3 and t0 are all written by the emitted
// code — callers must treat them as clobbered.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    Register tmp3, int ae)
{
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK, L;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // for L (Latin-1) strings, 1 byte for 1 character
  // for U (UTF-16) strings, 2 bytes for 1 character
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Number of characters that fit in one 64-bit word for the narrower encoding.
  int minCharsInWord = isLL ? wordSize : wordSize / 2;

  // Per-encoding single-character load: lbu for Latin-1, lhu for UTF-16.
  load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) {
    sraiw(cnt1, cnt1, 1);
  }
  if (!str2_isL) {
    sraiw(cnt2, cnt2, 1);
  }

  // Compute the minimum of the string lengths and save the difference in result.
  // After this, cnt2 == min(cnt1, cnt2) and result == cnt1 - cnt2.
  sub(result, cnt1, cnt2);
  bgt(cnt1, cnt2, L);
  mv(cnt2, cnt1);
  bind(L);

  // A very short string: compare character-by-character.
  mv(t0, minCharsInWord);
  ble(cnt2, t0, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      // check if str1 and str2 is same pointer
      beq(str1, str2, DONE);
      // load 8 bytes once to compare
      ld(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      sub(cnt2, cnt2, minCharsInWord);
      beqz(cnt2, TAIL_CHECK);
      // convert cnt2 from characters to bytes
      if (!str1_isL) {
        slli(cnt2, cnt2, 1);
      }
      // Point both strings at their last word and count up through zero,
      // so the loop index (cnt2) doubles as the termination test.
      add(str2, str2, cnt2);
      add(str1, str1, cnt2);
      sub(cnt2, zr, cnt2);
    } else if (isLU) { // LU case: str1 Latin-1, str2 UTF-16
      lwu(tmp1, Address(str1));  // 4 Latin-1 chars
      ld(tmp2, Address(str2));   // 4 UTF-16 chars
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      add(str1, str1, cnt2);
      sub(cnt1, zr, cnt2);       // cnt1 tracks the Latin-1 side (byte index)
      slli(cnt2, cnt2, 1);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp1);  // widen 4 Latin-1 bytes to UTF-16 for comparison
      mv(tmp1, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 4);
    } else { // UL case: str1 UTF-16, str2 Latin-1
      ld(tmp1, Address(str1));
      lwu(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      slli(t0, cnt2, 1);
      sub(cnt1, zr, t0);
      add(str1, str1, t0);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 8);
    }
    addi(cnt2, cnt2, isUL ? 4 : 8);
    bne(tmp1, tmp2, DIFFERENCE);
    bgez(cnt2, TAIL);

    // main loop: compare one 64-bit chunk (widened on the Latin-1 side) per pass
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) { // LL or UU
      add(t0, str1, cnt2);
      ld(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt2, cnt2, 8);
    } else if (isLU) { // LU case
      add(t0, str1, cnt1);
      lwu(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt1, cnt1, 4);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      addi(cnt2, cnt2, 8);
    } else { // UL case
      add(t0, str2, cnt2);
      lwu(tmp2, Address(t0));
      add(t0, str1, cnt1);
      ld(tmp1, Address(t0));
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      addi(cnt1, cnt1, 8);
      addi(cnt2, cnt2, 4);
    }
    bne(tmp1, tmp2, DIFFERENCE);
    bltz(cnt2, NEXT_WORD);
    bind(TAIL);
    // Last (possibly overlapping) word — loaded misaligned-safe.
    if (str1_isL == str2_isL) { // LL or UU
      load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
      load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
    } else if (isLU) { // LU case
      load_int_misaligned(tmp1, Address(str1), tmp3, false);
      load_long_misaligned(tmp2, Address(str2), tmp3, 2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
    } else { // UL case
      load_int_misaligned(tmp2, Address(str2), tmp3, false);
      load_long_misaligned(tmp1, Address(str1), tmp3, 2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
    }
    bind(TAIL_CHECK);
    beq(tmp1, tmp2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    xorr(tmp3, tmp1, tmp2);
    ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
    srl(tmp1, tmp1, result);
    srl(tmp2, tmp2, result);
    // Isolate the differing character in each word.
    if (isLL) {
      andi(tmp1, tmp1, 0xFF);
      andi(tmp2, tmp2, 0xFF);
    } else {
      andi(tmp1, tmp1, 0xFFFF);
      andi(tmp2, tmp2, 0xFFFF);
    }
    sub(result, tmp1, tmp2);
    j(DONE);
  }

  // Long strings: delegate to the generated compare_long_string_* stub.
  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch (ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  beqz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  j(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(t0, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bne(tmp1, cnt1, SHORT_LOOP_TAIL);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  beq(tmp2, t0, SHORT_LOOP);
  sub(result, tmp2, t0);
  j(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  j(DONE);
  bind(SHORT_LAST2);
  beq(tmp2, t0, DONE);
  sub(result, tmp2, t0);

  j(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bind(SHORT_LAST);
  beq(tmp1, cnt1, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

// Compare two arrays of 1- or 2-byte elements for equality, 16 bytes at a time.
// Sets 'result' to true/false. Identical references and equal lengths are
// fast-pathed; a null on either side yields false.
// t0, t1 and tmp3-tmp6 are used as scratch.
void C2_MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                      Register tmp4, Register tmp5, Register tmp6, Register result,
                                      Register cnt1, int elem_size) {
  Label DONE, SAME, NEXT_DWORD, SHORT, TAIL, TAIL2, IS_TMP5_ZR;
  Register tmp1 = t0;
  Register tmp2 = t1;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  Register elem_per_word = tmp6;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, t0, t1, tmp3, tmp4, tmp5, tmp6);
  mv(elem_per_word, wordSize / elem_size);

  BLOCK_COMMENT("arrays_equals {");

  // if (a1 == a2), return true
  beq(a1, a2, SAME);

  mv(result, false);
  // null check both arrays, then compare lengths
  beqz(a1, DONE);
  beqz(a2, DONE);
  lwu(cnt1, Address(a1, length_offset));
  lwu(cnt2, Address(a2, length_offset));
  bne(cnt2, cnt1, DONE);
  beqz(cnt1, SAME);

  // tmp5 = negated bit count of the payload; later used as a shift amount to
  // mask off bytes beyond the logical end of the final (overreading) word.
  slli(tmp5, cnt1, 3 + log_elem_size);
  sub(tmp5, zr, tmp5);
  add(a1, a1, base_offset);
  add(a2, a2, base_offset);
  ld(tmp3, Address(a1, 0));
  ld(tmp4, Address(a2, 0));
  ble(cnt1, elem_per_word, SHORT); // short or same

  // Main 16 byte comparison loop with 2 exits
  bind(NEXT_DWORD); {
    ld(tmp1, Address(a1, wordSize));
    ld(tmp2, Address(a2, wordSize));
    sub(cnt1, cnt1, 2 * wordSize / elem_size);
    blez(cnt1, TAIL);
    bne(tmp3, tmp4, DONE);
    ld(tmp3, Address(a1, 2 * wordSize));
    ld(tmp4, Address(a2, 2 * wordSize));
    add(a1, a1, 2 * wordSize);
    add(a2, a2, 2 * wordSize);
    ble(cnt1, elem_per_word, TAIL2);
  } beq(tmp1, tmp2, NEXT_DWORD);
  j(DONE);

  bind(TAIL);
  // Combine the two pending word-differences, masking the out-of-range bytes
  // of the second word via the tmp5 shift.
  xorr(tmp4, tmp3, tmp4);
  xorr(tmp2, tmp1, tmp2);
  sll(tmp2, tmp2, tmp5);
  orr(tmp5, tmp4, tmp2);
  j(IS_TMP5_ZR);

  bind(TAIL2);
  bne(tmp1, tmp2, DONE);

  bind(SHORT);
  xorr(tmp4, tmp3, tmp4);
  sll(tmp5, tmp4, tmp5);

  bind(IS_TMP5_ZR);
  bnez(tmp5, DONE);

  bind(SAME);
  mv(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (for hw supporting unaligned access: including the final one,
// which may overlap) are performed 8 bytes at a time.
// For strings < 8 bytes (and for tails of long strings when
// AvoidUnalignedAccesses is true), we compare a
// halfword, then a short, and then a byte.

void C2_MacroAssembler::string_equals(Register a1, Register a2,
                                      Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = t0;
  Register tmp2 = t1;

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);

  BLOCK_COMMENT("string_equals {");

  beqz(cnt1, SAME);
  mv(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  sub(cnt1, cnt1, wordSize);
  bltz(cnt1, SHORT);

  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ld(tmp1, Address(a1, 0));
    add(a1, a1, wordSize);
    ld(tmp2, Address(a2, 0));
    add(a2, a2, wordSize);
    sub(cnt1, cnt1, wordSize);
    bne(tmp1, tmp2, DONE);
  } bgez(cnt1, NEXT_WORD);

  if (!AvoidUnalignedAccesses) {
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    add(tmp1, a1, cnt1);
    ld(tmp1, Address(tmp1, 0));
    add(tmp2, a2, cnt1);
    ld(tmp2, Address(tmp2, 0));
    bne(tmp1, tmp2, DONE);
    j(SAME);
  } else {
    // No overlapping load allowed: if nothing is left (cnt1 == -wordSize),
    // the strings are equal; otherwise fall through to the byte-wise tail.
    add(tmp1, cnt1, wordSize);
    beqz(tmp1, SAME);
  }

  bind(SHORT);
  Label TAIL03, TAIL01;

  // 0-7 bytes left.
  test_bit(tmp1, cnt1, 2);
  beqz(tmp1, TAIL03);
  {
    lwu(tmp1, Address(a1, 0));
    add(a1, a1, 4);
    lwu(tmp2, Address(a2, 0));
    add(a2, a2, 4);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL03);
  // 0-3 bytes left.
  test_bit(tmp1, cnt1, 1);
  beqz(tmp1, TAIL01);
  {
    lhu(tmp1, Address(a1, 0));
    add(a1, a1, 2);
    lhu(tmp2, Address(a2, 0));
    add(a2, a2, 2);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    // 0-1 bytes left.
    test_bit(tmp1, cnt1, 0);
    beqz(tmp1, SAME);
    {
      lbu(tmp1, Address(a1, 0));
      lbu(tmp2, Address(a2, 0));
      bne(tmp1, tmp2, DONE);
    }
  }

  // Arrays are equal.
  bind(SAME);
  mv(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}

typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
                                                              bool is_far, bool is_unordered);

// Branch-emitter dispatch table indexed by the C2 comparison flag.
// NOTE(review): entry order presumably mirrors the BoolTest enum values
// (eq, gt, overflow, lt, ne, le, no_overflow, ge) — verify against subnode.hpp.
static conditional_branch_insn conditional_branches[] =
{
  /* SHORT branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgt,
  nullptr, // BoolTest::overflow
  (conditional_branch_insn)&MacroAssembler::blt,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::ble,
  nullptr, // BoolTest::no_overflow
  (conditional_branch_insn)&MacroAssembler::bge,

  /* UNSIGNED branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgtu,
  nullptr,
  (conditional_branch_insn)&MacroAssembler::bltu,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::bleu,
  nullptr,
  (conditional_branch_insn)&MacroAssembler::bgeu
};

// Floating-point analogue of conditional_branches: first half handles float,
// second half double (selected via double_branch_mask in the flag).
static float_conditional_branch_insn float_conditional_branches[] =
{
  /* FLOAT SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::float_beq,
  (float_conditional_branch_insn)&MacroAssembler::float_bgt,
  nullptr, // BoolTest::overflow
  (float_conditional_branch_insn)&MacroAssembler::float_blt,
  (float_conditional_branch_insn)&MacroAssembler::float_bne,
  (float_conditional_branch_insn)&MacroAssembler::float_ble,
  nullptr, // BoolTest::no_overflow
  (float_conditional_branch_insn)&MacroAssembler::float_bge,

  /* DOUBLE SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::double_beq,
  (float_conditional_branch_insn)&MacroAssembler::double_bgt,
  nullptr,
  (float_conditional_branch_insn)&MacroAssembler::double_blt,
  (float_conditional_branch_insn)&MacroAssembler::double_bne,
  (float_conditional_branch_insn)&MacroAssembler::double_ble,
  nullptr,
  (float_conditional_branch_insn)&MacroAssembler::double_bge
};

// Emit the integer conditional branch selected by cmpFlag (index into
// conditional_branches above).
void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
  assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
         "invalid conditional branch index");
  (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
}

// This function should only be used by C2. Flip the unordered handling when the
// condition is unordered-greater: C2 uses unordered-lesser instead of
// unordered-greater, and the result bits are commuted in do_one_bytecode().
void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
  assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
         "invalid float conditional branch index");
  // Strip the float/double selector bit to recover the plain BoolTest value.
  int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
  (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
    (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
}

// Branch on a comparison of op1 against immediate zero (unsigned flavor):
// eq/le collapse to "== 0" and ne/gt to "!= 0" since unsigned values are >= 0.
void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
  switch (cmpFlag) {
    case BoolTest::eq:
    case BoolTest::le:
      beqz(op1, L, is_far);
      break;
    case BoolTest::ne:
    case BoolTest::gt:
      bnez(op1, L, is_far);
      break;
    default:
      ShouldNotReachHere();
  }
}

// Branch on op1 == 0 (eq) or op1 != 0 (ne).
void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
  switch (cmpFlag) {
    case BoolTest::eq:
      beqz(op1, L, is_far);
      break;
    case BoolTest::ne:
      bnez(op1, L, is_far);
      break;
    default:
      ShouldNotReachHere();
  }
}

// Conditional move: dst = src when the comparison holds. Implemented by
// branching over the mv with the negated condition.
void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
  Label L;
  cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
  mv(dst, src);
  bind(L);
}

// Scalar float/double min/max. Set dst to NaN if any NaN input.
// Clobbers t0 and t1 (fclass results).
void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
                                  bool is_double, bool is_min) {
  assert_different_registers(dst, src1, src2);

  Label Done, Compare;

  is_double ? fclass_d(t0, src1)
            : fclass_s(t0, src1);
  is_double ? fclass_d(t1, src2)
            : fclass_s(t1, src2);
  orr(t0, t0, t1);
  andi(t0, t0, 0b1100000000); // if src1 or src2 is quiet or signaling NaN then return NaN
  beqz(t0, Compare);
  // NaN present: fadd propagates (and quiets) the NaN into dst.
  is_double ? fadd_d(dst, src1, src2)
            : fadd_s(dst, src1, src2);
  j(Done);

  bind(Compare);
  if (is_double) {
    is_min ? fmin_d(dst, src1, src2)
           : fmax_d(dst, src1, src2);
  } else {
    is_min ? fmin_s(dst, src1, src2)
           : fmax_s(dst, src1, src2);
  }

  bind(Done);
}

// Vector helper: compare the byte/char arrays at a1/a2 element-wise, 'cnt'
// elements total. Branches to DONE on the first mismatching chunk (with the
// mismatch lane index in tmp2); on full match sets result = true and falls
// through. a1, a2 and cnt are advanced/consumed.
void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
                                        VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
  Label loop;
  Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;

  bind(loop);
  vsetvli(tmp1, cnt, sew, Assembler::m2);
  vlex_v(vr1, a1, sew);
  vlex_v(vr2, a2, sew);
  vmsne_vv(vrs, vr1, vr2);
  vfirst_m(tmp2, vrs);       // index of first unequal element, or -1
  bgez(tmp2, DONE);
  sub(cnt, cnt, tmp1);
  if (!islatin) {
    slli(tmp1, tmp1, 1); // get byte counts
  }
  add(a1, a1, tmp1);
  add(a2, a2, tmp1);
  bnez(cnt, loop);

  mv(result, true);
}

// Vector version of string_equals: cnt arrives in bytes and is converted to
// characters for the UTF-16 case before delegating to element_compare.
void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt, int elem_size) {
  Label DONE;
  Register tmp1 = t0;
  Register tmp2 = t1;

  BLOCK_COMMENT("string_equals_v {");

  mv(result, false);

  if (elem_size == 2) {
    srli(cnt, cnt, 1);
  }

  element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);

  bind(DONE);
  BLOCK_COMMENT("} string_equals_v");
}

// used by C2 ClearArray patterns.
//   base: Address of a buffer to be zeroed
//   cnt: Count in HeapWords
//
// base, cnt, v4, v5, v6, v7 and t0 are clobbered.
1639 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) { 1640 Label loop; 1641 1642 // making zero words 1643 vsetvli(t0, cnt, Assembler::e64, Assembler::m4); 1644 vxor_vv(v4, v4, v4); 1645 1646 bind(loop); 1647 vsetvli(t0, cnt, Assembler::e64, Assembler::m4); 1648 vse64_v(v4, base); 1649 sub(cnt, cnt, t0); 1650 shadd(base, t0, base, t0, 3); 1651 bnez(cnt, loop); 1652 } 1653 1654 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result, 1655 Register cnt1, int elem_size) { 1656 Label DONE; 1657 Register tmp1 = t0; 1658 Register tmp2 = t1; 1659 Register cnt2 = tmp2; 1660 int length_offset = arrayOopDesc::length_offset_in_bytes(); 1661 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); 1662 1663 BLOCK_COMMENT("arrays_equals_v {"); 1664 1665 // if (a1 == a2), return true 1666 mv(result, true); 1667 beq(a1, a2, DONE); 1668 1669 mv(result, false); 1670 // if a1 == null or a2 == null, return false 1671 beqz(a1, DONE); 1672 beqz(a2, DONE); 1673 // if (a1.length != a2.length), return false 1674 lwu(cnt1, Address(a1, length_offset)); 1675 lwu(cnt2, Address(a2, length_offset)); 1676 bne(cnt1, cnt2, DONE); 1677 1678 la(a1, Address(a1, base_offset)); 1679 la(a2, Address(a2, base_offset)); 1680 1681 element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE); 1682 1683 bind(DONE); 1684 1685 BLOCK_COMMENT("} arrays_equals_v"); 1686 } 1687 1688 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2, 1689 Register result, Register tmp1, Register tmp2, int encForm) { 1690 Label DIFFERENCE, DONE, L, loop; 1691 bool encLL = encForm == StrIntrinsicNode::LL; 1692 bool encLU = encForm == StrIntrinsicNode::LU; 1693 bool encUL = encForm == StrIntrinsicNode::UL; 1694 1695 bool str1_isL = encLL || encLU; 1696 bool str2_isL = encLL || encUL; 1697 1698 int minCharsInWord = encLL ? 
wordSize : wordSize / 2; 1699 1700 BLOCK_COMMENT("string_compare {"); 1701 1702 // for Latin strings, 1 byte for 1 character 1703 // for UTF16 strings, 2 bytes for 1 character 1704 if (!str1_isL) 1705 sraiw(cnt1, cnt1, 1); 1706 if (!str2_isL) 1707 sraiw(cnt2, cnt2, 1); 1708 1709 // if str1 == str2, return the difference 1710 // save the minimum of the string lengths in cnt2. 1711 sub(result, cnt1, cnt2); 1712 bgt(cnt1, cnt2, L); 1713 mv(cnt2, cnt1); 1714 bind(L); 1715 1716 if (str1_isL == str2_isL) { // LL or UU 1717 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE); 1718 j(DONE); 1719 } else { // LU or UL 1720 Register strL = encLU ? str1 : str2; 1721 Register strU = encLU ? str2 : str1; 1722 VectorRegister vstr1 = encLU ? v8 : v4; 1723 VectorRegister vstr2 = encLU ? v4 : v8; 1724 1725 bind(loop); 1726 vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2); 1727 vle8_v(vstr1, strL); 1728 vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4); 1729 vzext_vf2(vstr2, vstr1); 1730 vle16_v(vstr1, strU); 1731 vmsne_vv(v4, vstr2, vstr1); 1732 vfirst_m(tmp2, v4); 1733 bgez(tmp2, DIFFERENCE); 1734 sub(cnt2, cnt2, tmp1); 1735 add(strL, strL, tmp1); 1736 shadd(strU, tmp1, strU, tmp1, 1); 1737 bnez(cnt2, loop); 1738 j(DONE); 1739 } 1740 1741 bind(DIFFERENCE); 1742 slli(tmp1, tmp2, 1); 1743 add(str1, str1, str1_isL ? tmp2 : tmp1); 1744 add(str2, str2, str2_isL ? tmp2 : tmp1); 1745 str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0)); 1746 str2_isL ? 
lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0)); 1747 sub(result, tmp1, tmp2); 1748 1749 bind(DONE); 1750 } 1751 1752 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) { 1753 Label loop; 1754 assert_different_registers(src, dst, len, tmp, t0); 1755 1756 BLOCK_COMMENT("byte_array_inflate_v {"); 1757 bind(loop); 1758 vsetvli(tmp, len, Assembler::e8, Assembler::m2); 1759 vle8_v(v6, src); 1760 vsetvli(t0, len, Assembler::e16, Assembler::m4); 1761 vzext_vf2(v4, v6); 1762 vse16_v(v4, dst); 1763 sub(len, len, tmp); 1764 add(src, src, tmp); 1765 shadd(dst, tmp, dst, tmp, 1); 1766 bnez(len, loop); 1767 BLOCK_COMMENT("} byte_array_inflate_v"); 1768 } 1769 1770 // Compress char[] array to byte[]. 1771 // result: the array length if every element in array can be encoded; 0, otherwise. 1772 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len, 1773 Register result, Register tmp) { 1774 Label done; 1775 encode_iso_array_v(src, dst, len, result, tmp, false); 1776 beqz(len, done); 1777 mv(result, zr); 1778 bind(done); 1779 } 1780 1781 // Intrinsic for 1782 // 1783 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray 1784 // return the number of characters copied. 1785 // - java/lang/StringUTF16.compress 1786 // return zero (0) if copy fails, otherwise 'len'. 1787 // 1788 // This version always returns the number of characters copied. A successful 1789 // copy will complete with the post-condition: 'res' == 'len', while an 1790 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'. 1791 // 1792 // Clobbers: src, dst, len, result, t0 1793 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len, 1794 Register result, Register tmp, bool ascii) { 1795 Label loop, fail, done; 1796 1797 BLOCK_COMMENT("encode_iso_array_v {"); 1798 mv(result, 0); 1799 1800 bind(loop); 1801 mv(tmp, ascii ? 
0x7f : 0xff); 1802 vsetvli(t0, len, Assembler::e16, Assembler::m2); 1803 vle16_v(v2, src); 1804 1805 vmsgtu_vx(v1, v2, tmp); 1806 vfirst_m(tmp, v1); 1807 vmsbf_m(v0, v1); 1808 // compress char to byte 1809 vsetvli(t0, len, Assembler::e8); 1810 vncvt_x_x_w(v1, v2, Assembler::v0_t); 1811 vse8_v(v1, dst, Assembler::v0_t); 1812 1813 // fail if char > 0x7f/0xff 1814 bgez(tmp, fail); 1815 add(result, result, t0); 1816 add(dst, dst, t0); 1817 sub(len, len, t0); 1818 shadd(src, t0, src, t0, 1); 1819 bnez(len, loop); 1820 j(done); 1821 1822 bind(fail); 1823 add(result, result, tmp); 1824 1825 bind(done); 1826 BLOCK_COMMENT("} encode_iso_array_v"); 1827 } 1828 1829 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) { 1830 Label LOOP, SET_RESULT, DONE; 1831 1832 BLOCK_COMMENT("count_positives_v {"); 1833 assert_different_registers(ary, len, result, tmp); 1834 1835 mv(result, zr); 1836 1837 bind(LOOP); 1838 vsetvli(t0, len, Assembler::e8, Assembler::m4); 1839 vle8_v(v4, ary); 1840 vmslt_vx(v4, v4, zr); 1841 vfirst_m(tmp, v4); 1842 bgez(tmp, SET_RESULT); 1843 // if tmp == -1, all bytes are positive 1844 add(result, result, t0); 1845 1846 sub(len, len, t0); 1847 add(ary, ary, t0); 1848 bnez(len, LOOP); 1849 j(DONE); 1850 1851 // add remaining positive bytes count 1852 bind(SET_RESULT); 1853 add(result, result, tmp); 1854 1855 bind(DONE); 1856 BLOCK_COMMENT("} count_positives_v"); 1857 } 1858 1859 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1, 1860 Register ch, Register result, 1861 Register tmp1, Register tmp2, 1862 bool isL) { 1863 mv(result, zr); 1864 1865 Label loop, MATCH, DONE; 1866 Assembler::SEW sew = isL ? 
Assembler::e8 : Assembler::e16; 1867 bind(loop); 1868 vsetvli(tmp1, cnt1, sew, Assembler::m4); 1869 vlex_v(v4, str1, sew); 1870 vmseq_vx(v4, v4, ch); 1871 vfirst_m(tmp2, v4); 1872 bgez(tmp2, MATCH); // if equal, return index 1873 1874 add(result, result, tmp1); 1875 sub(cnt1, cnt1, tmp1); 1876 if (!isL) slli(tmp1, tmp1, 1); 1877 add(str1, str1, tmp1); 1878 bnez(cnt1, loop); 1879 1880 mv(result, -1); 1881 j(DONE); 1882 1883 bind(MATCH); 1884 add(result, result, tmp2); 1885 1886 bind(DONE); 1887 } 1888 1889 // Set dst to NaN if any NaN input. 1890 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, 1891 bool is_double, bool is_min, int vector_length) { 1892 assert_different_registers(dst, src1, src2); 1893 1894 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length); 1895 1896 is_min ? vfmin_vv(dst, src1, src2) 1897 : vfmax_vv(dst, src1, src2); 1898 1899 vmfne_vv(v0, src1, src1); 1900 vfadd_vv(dst, src1, src1, Assembler::v0_t); 1901 vmfne_vv(v0, src2, src2); 1902 vfadd_vv(dst, src2, src2, Assembler::v0_t); 1903 } 1904 1905 // Set dst to NaN if any NaN input. 1906 // The destination vector register elements corresponding to masked-off elements 1907 // are handled with a mask-undisturbed policy. 1908 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2, 1909 VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2, 1910 bool is_double, bool is_min, int vector_length) { 1911 assert_different_registers(src1, src2, tmp1, tmp2); 1912 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length); 1913 1914 // Check vector elements of src1 and src2 for NaN. 
1915 vmfeq_vv(tmp1, src1, src1); 1916 vmfeq_vv(tmp2, src2, src2); 1917 1918 vmandn_mm(v0, vmask, tmp1); 1919 vfadd_vv(dst, src1, src1, Assembler::v0_t); 1920 vmandn_mm(v0, vmask, tmp2); 1921 vfadd_vv(dst, src2, src2, Assembler::v0_t); 1922 1923 vmand_mm(tmp2, tmp1, tmp2); 1924 vmand_mm(v0, vmask, tmp2); 1925 is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t) 1926 : vfmax_vv(dst, src1, src2, Assembler::v0_t); 1927 } 1928 1929 // Set dst to NaN if any NaN input. 1930 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst, 1931 FloatRegister src1, VectorRegister src2, 1932 VectorRegister tmp1, VectorRegister tmp2, 1933 bool is_double, bool is_min, int vector_length, VectorMask vm) { 1934 assert_different_registers(dst, src1); 1935 assert_different_registers(src2, tmp1, tmp2); 1936 1937 Label L_done, L_NaN_1, L_NaN_2; 1938 // Set dst to src1 if src1 is NaN 1939 is_double ? feq_d(t0, src1, src1) 1940 : feq_s(t0, src1, src1); 1941 beqz(t0, L_NaN_2); 1942 1943 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length); 1944 vfmv_s_f(tmp2, src1); 1945 1946 is_min ? vfredmin_vs(tmp1, src2, tmp2, vm) 1947 : vfredmax_vs(tmp1, src2, tmp2, vm); 1948 vfmv_f_s(dst, tmp1); 1949 1950 // Checking NaNs in src2 1951 vmfne_vv(tmp1, src2, src2, vm); 1952 vcpop_m(t0, tmp1, vm); 1953 beqz(t0, L_done); 1954 1955 bind(L_NaN_1); 1956 vfredusum_vs(tmp1, src2, tmp2, vm); 1957 vfmv_f_s(dst, tmp1); 1958 j(L_done); 1959 1960 bind(L_NaN_2); 1961 is_double ? 
fmv_d(dst, src1)
            : fmv_s(dst, src1);
  bind(L_done);
}

// True while C2 is emitting into a scratch buffer to measure code size.
// Consults the current compile's PhaseOutput when a compile task is active,
// otherwise falls back to the MacroAssembler query.
bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}

// Reduce the vector src2 into the scalar dst, seeded with src1, using the
// reduction operation selected by opc. tmp is clobbered.
void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
                                          VectorRegister src2, VectorRegister tmp,
                                          int opc, BasicType bt, int vector_length, VectorMask vm) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  vsetvli_helper(bt, vector_length);
  // tmp[0] = src1, the reduction's start value.
  vmv_s_x(tmp, src1);
  switch (opc) {
    case Op_AddReductionVI:
    case Op_AddReductionVL:
      vredsum_vs(tmp, src2, tmp, vm);
      break;
    case Op_AndReductionV:
      vredand_vs(tmp, src2, tmp, vm);
      break;
    case Op_OrReductionV:
      vredor_vs(tmp, src2, tmp, vm);
      break;
    case Op_XorReductionV:
      vredxor_vs(tmp, src2, tmp, vm);
      break;
    case Op_MaxReductionV:
      vredmax_vs(tmp, src2, tmp, vm);
      break;
    case Op_MinReductionV:
      vredmin_vs(tmp, src2, tmp, vm);
      break;
    default:
      ShouldNotReachHere();
  }
  // The scalar result of a vector reduction is element 0.
  vmv_x_s(dst, tmp);
}

// Set vl and vtype for full and partial vector operations.
// (vma = mu, vta = tu, vill = false)
// Picks the cheapest encoding for the requested AVL:
//  - vsetivli when vector_length fits the 5-bit unsigned immediate (<= 31),
//  - the rs1 == x0 / rd != x0 vsetvli form (which requests vl = VLMAX) when
//    vector_length covers the whole vector for this element size,
//  - otherwise a general vsetvli with the AVL materialized in tmp.
void C2_MacroAssembler::vsetvli_helper(BasicType bt, int vector_length, LMUL vlmul, Register tmp) {
  Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
  if (vector_length <= 31) {
    vsetivli(tmp, vector_length, sew, vlmul);
  } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
    vsetvli(tmp, x0, sew, vlmul);
  } else {
    mv(tmp, vector_length);
    vsetvli(tmp, tmp, sew, vlmul);
  }
}

// Compare integral elements of src1 and src2 lane-wise under condition cond;
// vd receives the resulting mask.
void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
                                           int cond, BasicType bt, int vector_length, VectorMask vm) {
  assert(is_integral_type(bt), "unsupported element type");
  assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
  vsetvli_helper(bt, vector_length);
  // Clear vd first so that masked-off lanes read as 0.
  vmclr_m(vd);
  switch (cond) {
    case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
    case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
    case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
    case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
    case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
    case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

// Floating-point counterpart of compare_integral_v above.
void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
                                     int cond, BasicType bt, int vector_length, VectorMask vm) {
  assert(is_floating_point_type(bt), "unsupported element type");
  assert(vm == Assembler::v0_t ?
vd != v0 : true, "should be different registers");
  vsetvli_helper(bt, vector_length);
  // Clear vd first so that masked-off lanes read as 0.
  vmclr_m(vd);
  switch (cond) {
    case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
    case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
    case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
    case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
    case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
    case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

// Sign-extend the narrow elements of src (type src_bt) into the wider
// elements of dst (type dst_bt).
void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, int vector_length,
                                         VectorRegister src, BasicType src_bt) {
  assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
  assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
  // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
  // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
  // and the overlap is in the highest-numbered part of the destination register group.
  // Since LMUL=1, vd and vs cannot be the same.
assert_different_registers(dst, src);

  vsetvli_helper(dst_bt, vector_length);
  if (src_bt == T_BYTE) {
    switch (dst_bt) {
    case T_SHORT:
      vsext_vf2(dst, src); // 2x widening: e8 -> e16
      break;
    case T_INT:
      vsext_vf4(dst, src); // 4x widening: e8 -> e32
      break;
    case T_LONG:
      vsext_vf8(dst, src); // 8x widening: e8 -> e64
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_bt == T_SHORT) {
    if (dst_bt == T_INT) {
      vsext_vf2(dst, src); // e16 -> e32
    } else {
      vsext_vf4(dst, src); // e16 -> e64
    }
  } else if (src_bt == T_INT) {
    vsext_vf2(dst, src); // e32 -> e64
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, int vector_length,
                                         VectorRegister src, BasicType src_bt) {
  assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
  assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
  // t0 holds the AVL; the chained vsetvli calls below write the granted vl
  // back into t0 and reuse it for each successive narrowing step.
  mv(t0, vector_length);
  if (src_bt == T_LONG) {
    // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
    // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
    // So we can currently only scale down by 1/2 the width at a time.
vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
    vncvt_x_x_w(dst, src); // e64 -> e32
    if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
      vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
      vncvt_x_x_w(dst, dst); // e32 -> e16
      if (dst_bt == T_BYTE) {
        vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
        vncvt_x_x_w(dst, dst); // e16 -> e8
      }
    }
  } else if (src_bt == T_INT) {
    // T_SHORT
    vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
    vncvt_x_x_w(dst, src); // e32 -> e16
    if (dst_bt == T_BYTE) {
      vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
      vncvt_x_x_w(dst, dst); // e16 -> e8
    }
  } else if (src_bt == T_SHORT) {
    vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
    vncvt_x_x_w(dst, src); // e16 -> e8
  }
}

// NaN-safe float->int vector conversion: dst is pre-zeroed and only lanes
// whose input is non-NaN (x == x) are converted, so NaN lanes yield 0.
#define VFCVT_SAFE(VFLOATCVT) \
void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
  assert_different_registers(dst, src); \
  vxor_vv(dst, dst, dst); \
  vmfeq_vv(v0, src, src); \
  VFLOATCVT(dst, src, Assembler::v0_t); \
}

VFCVT_SAFE(vfcvt_rtz_x_f_v);

#undef VFCVT_SAFE

// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
                                  int idx, VectorRegister tmp) {
  assert(is_integral_type(bt), "unsupported element type");
  assert(idx >= 0, "idx cannot be negative");
  // Only need the first element after vector slidedown
  vsetvli_helper(bt, 1);
  if (idx == 0) {
    vmv_x_s(dst, src);
  } else if (idx <= 31) {
    // idx fits vslidedown.vi's 5-bit unsigned immediate.
    vslidedown_vi(tmp, src, idx);
    vmv_x_s(dst, tmp);
  } else {
    // Large index: materialize it in t0 and use the register form.
    mv(t0, idx);
    vslidedown_vx(tmp, src, t0);
    vmv_x_s(dst, tmp);
  }
}

// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of floating point type.
2165 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt, 2166 int idx, VectorRegister tmp) { 2167 assert(is_floating_point_type(bt), "unsupported element type"); 2168 assert(idx >= 0, "idx cannot be negative"); 2169 // Only need the first element after vector slidedown 2170 vsetvli_helper(bt, 1); 2171 if (idx == 0) { 2172 vfmv_f_s(dst, src); 2173 } else if (idx <= 31) { 2174 vslidedown_vi(tmp, src, idx); 2175 vfmv_f_s(dst, tmp); 2176 } else { 2177 mv(t0, idx); 2178 vslidedown_vx(tmp, src, t0); 2179 vfmv_f_s(dst, tmp); 2180 } 2181 }