1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 #include "utilities/globalDefinitions.hpp" 36 37 #ifdef PRODUCT 38 #define BLOCK_COMMENT(str) /* nothing */ 39 #define STOP(error) stop(error) 40 #else 41 #define BLOCK_COMMENT(str) block_comment(str) 42 #define STOP(error) block_comment(error); stop(error) 43 #endif 44 45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 46 47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 48 49 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 50 Register tmp2Reg, Register tmp3Reg) { 51 Register oop = objectReg; 52 Register box = boxReg; 53 Register disp_hdr = tmpReg; 54 Register tmp = tmp2Reg; 55 Label object_has_monitor; 56 Label count, no_count; 57 58 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 59 assert_different_registers(oop, box, tmp, disp_hdr, rscratch1); 60 61 // Load markWord from object into displaced_header. 62 ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 63 64 if (DiagnoseSyncOnValueBasedClasses != 0) { 65 load_klass(tmp, oop); 66 ldrw(tmp, Address(tmp, Klass::access_flags_offset())); 67 tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS); 68 br(Assembler::NE, no_count); 69 } 70 71 // Check for existing monitor 72 tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor); 73 74 if (LockingMode == LM_MONITOR) { 75 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 76 b(no_count); 77 } else { 78 assert(LockingMode == LM_LEGACY, "must be"); 79 // Set tmp to be (markWord of object | UNLOCK_VALUE). 
80 orr(tmp, disp_hdr, markWord::unlocked_value); 81 82 // Initialize the box. (Must happen before we update the object mark!) 83 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 84 85 // Compare object markWord with an unlocked value (tmp) and if 86 // equal exchange the stack address of our box with object markWord. 87 // On failure disp_hdr contains the possibly locked markWord. 88 cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true, 89 /*release*/ true, /*weak*/ false, disp_hdr); 90 br(Assembler::EQ, count); 91 92 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 93 94 // If the compare-and-exchange succeeded, then we found an unlocked 95 // object, will have now locked it will continue at label cont 96 97 // Check if the owner is self by comparing the value in the 98 // markWord of object (disp_hdr) with the stack pointer. 99 mov(rscratch1, sp); 100 sub(disp_hdr, disp_hdr, rscratch1); 101 mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place)); 102 // If condition is true we are cont and hence we can store 0 as the 103 // displaced header in the box, which indicates that it is a recursive lock. 104 ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result 105 str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes())); 106 b(no_count); 107 } 108 109 // Handle existing monitor. 110 bind(object_has_monitor); 111 112 // The object's monitor m is unlocked iff m->owner == nullptr, 113 // otherwise m->owner may contain a thread id, a stack address for LM_LEGACY, 114 // or the ANONYMOUS_OWNER constant for LM_LIGHTWEIGHT. 115 // 116 // Try to CAS m->owner from null to current thread. 
117 ldr(rscratch2, Address(rthread, JavaThread::lock_id_offset())); 118 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value)); 119 cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true, 120 /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result 121 122 // Store a non-null value into the box to avoid looking like a re-entrant 123 // lock. The fast-path monitor unlock code checks for 124 // markWord::monitor_value so use markWord::unused_mark which has the 125 // relevant bit set, and also matches ObjectSynchronizer::enter. 126 mov(tmp, (address)markWord::unused_mark().value()); 127 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 128 129 br(Assembler::EQ, no_count); // CAS success means locking succeeded 130 131 cmp(tmp3Reg, rscratch2); 132 br(Assembler::NE, no_count); // Check for recursive locking 133 134 // Recursive lock case 135 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1); 136 // flag == EQ still from the cmp above, checking if this is a reentrant lock 137 b(no_count); 138 139 bind(count); 140 inc_held_monitor_count(); 141 142 bind(no_count); 143 // flag == EQ indicates success 144 // flag == NE indicates failure 145 } 146 147 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg, 148 Register tmp2Reg) { 149 Register oop = objectReg; 150 Register box = boxReg; 151 Register disp_hdr = tmpReg; 152 Register tmp = tmp2Reg; 153 Label cont; 154 Label object_has_monitor; 155 Label no_count; 156 157 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 158 assert_different_registers(oop, box, tmp, disp_hdr); 159 160 if (LockingMode == LM_LEGACY) { 161 // Find the lock address and load the displaced header from the stack. 
162 ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes())); 163 164 // If the displaced header is 0, we have a recursive unlock. 165 cmp(disp_hdr, zr); 166 br(Assembler::EQ, no_count); 167 } 168 169 // Handle existing monitor. 170 ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes())); 171 tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor); 172 173 if (LockingMode == LM_MONITOR) { 174 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 175 b(no_count); 176 } else { 177 assert(LockingMode == LM_LEGACY, "must be"); 178 // Check if it is still a light weight lock, this is is true if we 179 // see the stack address of the basicLock in the markWord of the 180 // object. 181 182 cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false, 183 /*release*/ true, /*weak*/ false, tmp); 184 b(cont); 185 } 186 187 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 188 189 // Handle existing monitor. 190 bind(object_has_monitor); 191 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 192 add(tmp, tmp, -(int)markWord::monitor_value); // monitor 193 194 // If the owner is anonymous, we need to fix it -- in an outline stub. 195 Register tmp2 = disp_hdr; 196 ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset())); 197 // We cannot use tbnz here, the target might be too far away and cannot 198 // be encoded. 
199 mov(rscratch1, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER); 200 cmp(tmp2, rscratch1); 201 C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2); 202 Compile::current()->output()->add_stub(stub); 203 br(Assembler::EQ, stub->entry()); 204 bind(stub->continuation()); 205 206 ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 207 208 Label notRecursive; 209 cbz(disp_hdr, notRecursive); 210 211 // Recursive lock 212 sub(disp_hdr, disp_hdr, 1u); 213 str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 214 cmp(disp_hdr, disp_hdr); // Sets flags for result 215 b(no_count); 216 217 bind(notRecursive); 218 ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset())); 219 ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset())); 220 orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0. 221 cmp(rscratch1, zr); // Sets flags for result 222 cbnz(rscratch1, no_count); 223 // need a release store here 224 lea(tmp, Address(tmp, ObjectMonitor::owner_offset())); 225 stlr(zr, tmp); // set unowned 226 b(no_count); 227 228 bind(cont); 229 // flag == EQ indicates success 230 // flag == NE indicates failure 231 br(Assembler::NE, no_count); 232 233 dec_held_monitor_count(); 234 235 bind(no_count); 236 } 237 238 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1, 239 Register t2, Register t3) { 240 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 241 assert_different_registers(obj, t1, t2, t3); 242 243 // Handle inflated monitor. 244 Label inflated; 245 // Finish fast lock successfully. MUST branch to with flag == EQ 246 Label locked; 247 // Finish fast lock unsuccessfully. 
MUST branch to with flag == NE 248 Label slow_path; 249 250 if (DiagnoseSyncOnValueBasedClasses != 0) { 251 load_klass(t1, obj); 252 ldrw(t1, Address(t1, Klass::access_flags_offset())); 253 tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS); 254 br(Assembler::NE, slow_path); 255 } 256 257 const Register t1_mark = t1; 258 259 { // Lightweight locking 260 261 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ 262 Label push; 263 264 const Register t2_top = t2; 265 const Register t3_t = t3; 266 267 // Check if lock-stack is full. 268 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 269 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1); 270 br(Assembler::GT, slow_path); 271 272 // Check if recursive. 273 subw(t3_t, t2_top, oopSize); 274 ldr(t3_t, Address(rthread, t3_t)); 275 cmp(obj, t3_t); 276 br(Assembler::EQ, push); 277 278 // Relaxed normal load to check for monitor. Optimization for monitor case. 279 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 280 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 281 282 // Not inflated 283 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 284 285 // Try to lock. Transition lock-bits 0b01 => 0b00 286 orr(t1_mark, t1_mark, markWord::unlocked_value); 287 eor(t3_t, t1_mark, markWord::unlocked_value); 288 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 289 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 290 br(Assembler::NE, slow_path); 291 292 bind(push); 293 // After successful lock, push object on lock-stack. 294 str(obj, Address(rthread, t2_top)); 295 addw(t2_top, t2_top, oopSize); 296 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 297 b(locked); 298 } 299 300 { // Handle inflated monitor. 301 bind(inflated); 302 303 // mark contains the tagged ObjectMonitor*. 
304 const Register t1_tagged_monitor = t1_mark; 305 const uintptr_t monitor_tag = markWord::monitor_value; 306 const Register t2_owner_addr = t2; 307 const Register t3_owner = t3; 308 309 // Compute owner address. 310 lea(t2_owner_addr, Address(t1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag))); 311 312 // CAS owner (null => current thread id). 313 ldr(rscratch2, Address(rthread, JavaThread::lock_id_offset())); 314 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true, 315 /*release*/ false, /*weak*/ false, t3_owner); 316 br(Assembler::EQ, locked); 317 318 // Check if recursive. 319 cmp(t3_owner, rscratch2); 320 br(Assembler::NE, slow_path); 321 322 // Recursive. 323 increment(Address(t1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1); 324 } 325 326 bind(locked); 327 328 #ifdef ASSERT 329 // Check that locked label is reached with Flags == EQ. 330 Label flag_correct; 331 br(Assembler::EQ, flag_correct); 332 stop("Fast Lock Flag != EQ"); 333 #endif 334 335 bind(slow_path); 336 #ifdef ASSERT 337 // Check that slow_path label is reached with Flags == NE. 338 br(Assembler::NE, flag_correct); 339 stop("Fast Lock Flag != NE"); 340 bind(flag_correct); 341 #endif 342 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 343 } 344 345 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2, 346 Register t3) { 347 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 348 assert_different_registers(obj, t1, t2, t3); 349 350 // Handle inflated monitor. 351 Label inflated, inflated_load_monitor; 352 // Finish fast unlock successfully. MUST branch to with flag == EQ 353 Label unlocked; 354 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 355 Label slow_path; 356 357 const Register t1_mark = t1; 358 const Register t2_top = t2; 359 const Register t3_t = t3; 360 361 { // Lightweight unlock 362 363 // Check if obj is top of lock-stack. 
364 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 365 subw(t2_top, t2_top, oopSize); 366 ldr(t3_t, Address(rthread, t2_top)); 367 cmp(obj, t3_t); 368 // Top of lock stack was not obj. Must be monitor. 369 br(Assembler::NE, inflated_load_monitor); 370 371 // Pop lock-stack. 372 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 373 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 374 375 // Check if recursive. 376 subw(t3_t, t2_top, oopSize); 377 ldr(t3_t, Address(rthread, t3_t)); 378 cmp(obj, t3_t); 379 br(Assembler::EQ, unlocked); 380 381 // Not recursive. 382 // Load Mark. 383 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 384 385 // Check header for monitor (0b10). 386 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 387 388 // Try to unlock. Transition lock bits 0b00 => 0b01 389 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 390 orr(t3_t, t1_mark, markWord::unlocked_value); 391 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 392 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 393 br(Assembler::EQ, unlocked); 394 395 // Compare and exchange failed. 396 // Restore lock-stack and handle the unlock in runtime. 397 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 398 addw(t2_top, t2_top, oopSize); 399 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 400 b(slow_path); 401 } 402 403 404 { // Handle inflated monitor. 
405 bind(inflated_load_monitor); 406 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 407 #ifdef ASSERT 408 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 409 stop("Fast Unlock not monitor"); 410 #endif 411 412 bind(inflated); 413 414 #ifdef ASSERT 415 Label check_done; 416 subw(t2_top, t2_top, oopSize); 417 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 418 br(Assembler::LT, check_done); 419 ldr(t3_t, Address(rthread, t2_top)); 420 cmp(obj, t3_t); 421 br(Assembler::NE, inflated); 422 stop("Fast Unlock lock on stack"); 423 bind(check_done); 424 #endif 425 426 // mark contains the tagged ObjectMonitor*. 427 const Register t1_monitor = t1_mark; 428 const uintptr_t monitor_tag = markWord::monitor_value; 429 430 // Untag the monitor. 431 sub(t1_monitor, t1_mark, monitor_tag); 432 433 const Register t2_recursions = t2; 434 Label not_recursive; 435 436 // Check if recursive. 437 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 438 cbz(t2_recursions, not_recursive); 439 440 // Recursive unlock. 441 sub(t2_recursions, t2_recursions, 1u); 442 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 443 // Set flag == EQ 444 cmp(t2_recursions, t2_recursions); 445 b(unlocked); 446 447 bind(not_recursive); 448 449 Label release; 450 const Register t2_owner_addr = t2; 451 452 // Compute owner address. 453 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 454 455 // Check if the entry lists are empty. 456 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset())); 457 ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset())); 458 orr(rscratch1, rscratch1, t3_t); 459 cmp(rscratch1, zr); 460 br(Assembler::EQ, release); 461 462 // The owner may be anonymous and we removed the last obj entry in 463 // the lock-stack. This loses the information about the owner. 464 // Write the thread id to the owner field so the runtime knows the owner. 
465 ldr(t3_t, Address(rthread, JavaThread::lock_id_offset())); 466 str(t3_t, Address(t2_owner_addr)); 467 b(slow_path); 468 469 bind(release); 470 // Set owner to null. 471 // Release to satisfy the JMM 472 stlr(zr, t2_owner_addr); 473 } 474 475 bind(unlocked); 476 477 #ifdef ASSERT 478 // Check that unlocked label is reached with Flags == EQ. 479 Label flag_correct; 480 br(Assembler::EQ, flag_correct); 481 stop("Fast Unlock Flag != EQ"); 482 #endif 483 484 bind(slow_path); 485 #ifdef ASSERT 486 // Check that slow_path label is reached with Flags == NE. 487 br(Assembler::NE, flag_correct); 488 stop("Fast Unlock Flag != NE"); 489 bind(flag_correct); 490 #endif 491 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 492 } 493 494 // Search for str1 in str2 and return index or -1 495 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 496 void C2_MacroAssembler::string_indexof(Register str2, Register str1, 497 Register cnt2, Register cnt1, 498 Register tmp1, Register tmp2, 499 Register tmp3, Register tmp4, 500 Register tmp5, Register tmp6, 501 int icnt1, Register result, int ae) { 502 // NOTE: tmp5, tmp6 can be zr depending on specific method version 503 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 504 505 Register ch1 = rscratch1; 506 Register ch2 = rscratch2; 507 Register cnt1tmp = tmp1; 508 Register cnt2tmp = tmp2; 509 Register cnt1_neg = cnt1; 510 Register cnt2_neg = cnt2; 511 Register result_tmp = tmp4; 512 513 bool isL = ae == StrIntrinsicNode::LL; 514 515 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 516 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 517 int str1_chr_shift = str1_isL ? 0:1; 518 int str2_chr_shift = str2_isL ? 0:1; 519 int str1_chr_size = str1_isL ? 1:2; 520 int str2_chr_size = str2_isL ? 1:2; 521 chr_insn str1_load_1chr = str1_isL ? 
(chr_insn)&MacroAssembler::ldrb : 522 (chr_insn)&MacroAssembler::ldrh; 523 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 524 (chr_insn)&MacroAssembler::ldrh; 525 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 526 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 527 528 // Note, inline_string_indexOf() generates checks: 529 // if (substr.count > string.count) return -1; 530 // if (substr.count == 0) return 0; 531 532 // We have two strings, a source string in str2, cnt2 and a pattern string 533 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1. 534 535 // For larger pattern and source we use a simplified Boyer Moore algorithm. 536 // With a small pattern and source we use linear scan. 537 538 if (icnt1 == -1) { 539 sub(result_tmp, cnt2, cnt1); 540 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 541 br(LT, LINEARSEARCH); 542 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty 543 subs(zr, cnt1, 256); 544 lsr(tmp1, cnt2, 2); 545 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 546 br(GE, LINEARSTUB); 547 } 548 549 // The Boyer Moore alogorithm is based on the description here:- 550 // 551 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 552 // 553 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 554 // and the 'Good Suffix' rule. 555 // 556 // These rules are essentially heuristics for how far we can shift the 557 // pattern along the search string. 558 // 559 // The implementation here uses the 'Bad Character' rule only because of the 560 // complexity of initialisation for the 'Good Suffix' rule. 
561 // 562 // This is also known as the Boyer-Moore-Horspool algorithm:- 563 // 564 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 565 // 566 // This particular implementation has few java-specific optimizations. 567 // 568 // #define ASIZE 256 569 // 570 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 571 // int i, j; 572 // unsigned c; 573 // unsigned char bc[ASIZE]; 574 // 575 // /* Preprocessing */ 576 // for (i = 0; i < ASIZE; ++i) 577 // bc[i] = m; 578 // for (i = 0; i < m - 1; ) { 579 // c = x[i]; 580 // ++i; 581 // // c < 256 for Latin1 string, so, no need for branch 582 // #ifdef PATTERN_STRING_IS_LATIN1 583 // bc[c] = m - i; 584 // #else 585 // if (c < ASIZE) bc[c] = m - i; 586 // #endif 587 // } 588 // 589 // /* Searching */ 590 // j = 0; 591 // while (j <= n - m) { 592 // c = y[i+j]; 593 // if (x[m-1] == c) 594 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 595 // if (i < 0) return j; 596 // // c < 256 for Latin1 string, so, no need for branch 597 // #ifdef SOURCE_STRING_IS_LATIN1 598 // // LL case: (c< 256) always true. Remove branch 599 // j += bc[y[j+m-1]]; 600 // #endif 601 // #ifndef PATTERN_STRING_IS_UTF 602 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 603 // if (c < ASIZE) 604 // j += bc[y[j+m-1]]; 605 // else 606 // j += 1 607 // #endif 608 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 609 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 
610 // if (c < ASIZE) 611 // j += bc[y[j+m-1]]; 612 // else 613 // j += m 614 // #endif 615 // } 616 // } 617 618 if (icnt1 == -1) { 619 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 620 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 621 Register cnt1end = tmp2; 622 Register str2end = cnt2; 623 Register skipch = tmp2; 624 625 // str1 length is >=8, so, we can read at least 1 register for cases when 626 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 627 // UL case. We'll re-read last character in inner pre-loop code to have 628 // single outer pre-loop load 629 const int firstStep = isL ? 7 : 3; 630 631 const int ASIZE = 256; 632 const int STORED_BYTES = 32; // amount of bytes stored per instruction 633 sub(sp, sp, ASIZE); 634 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 635 mov(ch1, sp); 636 BIND(BM_INIT_LOOP); 637 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 638 subs(tmp5, tmp5, 1); 639 br(GT, BM_INIT_LOOP); 640 641 sub(cnt1tmp, cnt1, 1); 642 mov(tmp5, str2); 643 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 644 sub(ch2, cnt1, 1); 645 mov(tmp3, str1); 646 BIND(BCLOOP); 647 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 648 if (!str1_isL) { 649 subs(zr, ch1, ASIZE); 650 br(HS, BCSKIP); 651 } 652 strb(ch2, Address(sp, ch1)); 653 BIND(BCSKIP); 654 subs(ch2, ch2, 1); 655 br(GT, BCLOOP); 656 657 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 658 if (str1_isL == str2_isL) { 659 // load last 8 bytes (8LL/4UU symbols) 660 ldr(tmp6, Address(tmp6, -wordSize)); 661 } else { 662 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 663 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 664 // it's still faster than per-character loads+checks 665 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 666 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 667 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 668 andr(tmp6, tmp6, 0xFF); // str1[N-4] 669 orr(ch2, ch1, ch2, LSL, 16); 670 orr(tmp6, tmp6, tmp3, LSL, 48); 671 orr(tmp6, tmp6, ch2, LSL, 16); 672 } 673 BIND(BMLOOPSTR2); 674 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 675 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 676 if (str1_isL == str2_isL) { 677 // re-init tmp3. It's for free because it's executed in parallel with 678 // load above. Alternative is to initialize it before loop, but it'll 679 // affect performance on in-order systems with 2 or more ld/st pipelines 680 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 681 } 682 if (!isL) { // UU/UL case 683 lsl(ch2, cnt1tmp, 1); // offset in bytes 684 } 685 cmp(tmp3, skipch); 686 br(NE, BMSKIP); 687 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 688 mov(ch1, tmp6); 689 if (isL) { 690 b(BMLOOPSTR1_AFTER_LOAD); 691 } else { 692 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 693 b(BMLOOPSTR1_CMP); 694 } 695 BIND(BMLOOPSTR1); 696 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 697 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 698 BIND(BMLOOPSTR1_AFTER_LOAD); 699 subs(cnt1tmp, cnt1tmp, 1); 700 br(LT, BMLOOPSTR1_LASTCMP); 701 BIND(BMLOOPSTR1_CMP); 702 cmp(ch1, ch2); 703 br(EQ, BMLOOPSTR1); 704 BIND(BMSKIP); 705 if (!isL) { 706 // if we've met UTF symbol while searching Latin1 pattern, then we can 707 // skip cnt1 symbols 708 if (str1_isL != str2_isL) { 709 mov(result_tmp, cnt1); 710 } else { 711 mov(result_tmp, 1); 712 } 713 subs(zr, skipch, ASIZE); 714 br(HS, BMADV); 715 } 716 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 717 BIND(BMADV); 718 sub(cnt1tmp, cnt1, 1); 719 add(str2, str2, result_tmp, LSL, str2_chr_shift); 720 cmp(str2, str2end); 721 br(LE, BMLOOPSTR2); 722 add(sp, sp, ASIZE); 723 b(NOMATCH); 724 BIND(BMLOOPSTR1_LASTCMP); 725 cmp(ch1, ch2); 726 br(NE, BMSKIP); 727 BIND(BMMATCH); 728 sub(result, str2, tmp5); 729 if (!str2_isL) lsr(result, result, 1); 730 add(sp, sp, ASIZE); 731 b(DONE); 732 733 BIND(LINEARSTUB); 734 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 735 br(LT, LINEAR_MEDIUM); 736 mov(result, zr); 737 RuntimeAddress stub = nullptr; 738 if (isL) { 739 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 740 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 741 } else if (str1_isL) { 742 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 743 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 744 } else { 745 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 746 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 747 } 748 address call = trampoline_call(stub); 749 if (call == nullptr) { 750 
DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 751 ciEnv::current()->record_failure("CodeCache is full"); 752 return; 753 } 754 b(DONE); 755 } 756 757 BIND(LINEARSEARCH); 758 { 759 Label DO1, DO2, DO3; 760 761 Register str2tmp = tmp2; 762 Register first = tmp3; 763 764 if (icnt1 == -1) 765 { 766 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 767 768 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 769 br(LT, DOSHORT); 770 BIND(LINEAR_MEDIUM); 771 (this->*str1_load_1chr)(first, Address(str1)); 772 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 773 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 774 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 775 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 776 777 BIND(FIRST_LOOP); 778 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 779 cmp(first, ch2); 780 br(EQ, STR1_LOOP); 781 BIND(STR2_NEXT); 782 adds(cnt2_neg, cnt2_neg, str2_chr_size); 783 br(LE, FIRST_LOOP); 784 b(NOMATCH); 785 786 BIND(STR1_LOOP); 787 adds(cnt1tmp, cnt1_neg, str1_chr_size); 788 add(cnt2tmp, cnt2_neg, str2_chr_size); 789 br(GE, MATCH); 790 791 BIND(STR1_NEXT); 792 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 793 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 794 cmp(ch1, ch2); 795 br(NE, STR2_NEXT); 796 adds(cnt1tmp, cnt1tmp, str1_chr_size); 797 add(cnt2tmp, cnt2tmp, str2_chr_size); 798 br(LT, STR1_NEXT); 799 b(MATCH); 800 801 BIND(DOSHORT); 802 if (str1_isL == str2_isL) { 803 cmp(cnt1, (u1)2); 804 br(LT, DO1); 805 br(GT, DO3); 806 } 807 } 808 809 if (icnt1 == 4) { 810 Label CH1_LOOP; 811 812 (this->*load_4chr)(ch1, str1); 813 sub(result_tmp, cnt2, 4); 814 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 815 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 816 817 BIND(CH1_LOOP); 818 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 819 cmp(ch1, ch2); 820 br(EQ, MATCH); 821 adds(cnt2_neg, cnt2_neg, str2_chr_size); 822 br(LE, 
CH1_LOOP);
    b(NOMATCH);
  }

  // Specialized search for a 2-character needle (or generic path when the
  // needle length is unknown at compile time, icnt1 == -1).
  if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
    Label CH1_LOOP;

    BIND(DO2);
    (this->*load_2chr)(ch1, str1);
    if (icnt1 == 2) {
      sub(result_tmp, cnt2, 2);
    }
    // Point str2 past the last candidate position and scan with a negative
    // byte offset so the loop termination test is a simple flags check.
    lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
    sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(CH1_LOOP);
    (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
    cmp(ch1, ch2);
    br(EQ, MATCH);
    adds(cnt2_neg, cnt2_neg, str2_chr_size);
    br(LE, CH1_LOOP);
    b(NOMATCH);
  }

  // Specialized search for a 3-character needle: compare the leading pair
  // first, then the third character only on a candidate hit.
  if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
    Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

    BIND(DO3);
    (this->*load_2chr)(first, str1);
    (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
    if (icnt1 == 3) {
      sub(result_tmp, cnt2, 3);
    }
    lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
    sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(FIRST_LOOP);
    (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
    cmpw(first, ch2);
    br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
    adds(cnt2_neg, cnt2_neg, str2_chr_size);
    br(LE, FIRST_LOOP);
    b(NOMATCH);

    BIND(STR1_LOOP);
    // Leading pair matched; check the third character of the needle.
    add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
    (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
    cmp(ch1, ch2);
    br(NE, STR2_NEXT);
    b(MATCH);
  }

  // Single-character needle: 8-bytes-at-a-time SWAR scan for long haystacks,
  // simple element-wise loop for short ones.
  if (icnt1 == -1 || icnt1 == 1) {
    Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

    BIND(DO1);
    (this->*str1_load_1chr)(ch1, str1);
    cmp(cnt2, (u1)8);
    br(LT, DO1_SHORT);

    sub(result_tmp, cnt2, 8/str2_chr_size);
    sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
    lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

    // Broadcast the needle character into every byte/halfword of ch1.
    if (str2_isL) {
      orr(ch1, ch1, ch1, LSL, 8);
    }
    orr(ch1, ch1, ch1, LSL, 16);
    orr(ch1, ch1, ch1, LSL, 32);
    BIND(CH1_LOOP);
    ldr(ch2, Address(str2, cnt2_neg));
    // XOR makes matching lanes zero; the (x - 0x01..) & ~x & 0x80.. style
    // test below then flags any zero lane in the 64-bit word.
    eor(ch2, ch1, ch2);
    sub(tmp1, ch2, tmp3);
    orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt2_neg, cnt2_neg, 8);
    br(LT, CH1_LOOP);

    // Re-scan the final (possibly overlapping) 8-byte word once.
    cmp(cnt2_neg, (u1)8);
    mov(cnt2_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

    BIND(HAS_ZERO);
    // Locate the first matching lane: byte offset = clz(bit-reversed mask)/8.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
    b(MATCH);

    BIND(DO1_SHORT);
    mov(result_tmp, cnt2);
    lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
    sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
    BIND(DO1_LOOP);
    (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
    cmpw(ch1, ch2);
    br(EQ, MATCH);
    adds(cnt2_neg, cnt2_neg, str2_chr_size);
    br(LT, DO1_LOOP);
  }
  }  // end of enclosing block (opened before this excerpt)
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  // Convert the (negative) byte offset back into a character index.
  add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of the 16-bit char 'ch' in the UTF-16 string
// (str1, cnt1 chars); result = index of the match, or -1 if absent.
// Clobbers: rscratch1, rscratch2; str1/cnt1 are consumed.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string never matches.
  cbz(cnt1, NOMATCH);

  // Fewer than 4 chars (one 8-byte word): use the scalar loop.
  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Broadcast 'ch' into all four halfword lanes of the 64-bit register.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  // Scan from the end using a negative offset (see string_indexof above).
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  // SWAR zero-lane detection on the XORed word: a matching halfword
  // becomes zero and trips the borrow/mask test below.
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  // Handle the tail by re-checking the last word from offset 0.
  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  // First matching lane -> byte offset via bit-reverse + count-leading-zeros.
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  // Byte offset (ASR 1) back to a character index.
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// SVE variant of indexof(char): scan the string a full vector at a time,
// using WHILELT-generated predicates so no scalar tail loop is needed.
// isL selects Latin-1 (byte) vs UTF-16 (halfword) elements.
// Clobbers: rscratch1, rscratch2, rflags.
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  // vec_len = number of elements per vector for the chosen element size.
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
  // Read a vector of 8- or 16-bit data depending on the string type. Note
  // that inactive elements indicated by the predicate register won't cause
  // a data read from memory to the destination vector.
  if (isL) {
    sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
  } else {
    sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
  }
  add(idx, idx, vec_len);

  // Perform the comparison. An element of the destination predicate is set
  // to active if the particular char is matched.
  sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

  // Branch if the particular char is found.
  br(NE, MATCH);

  sve_whilelt(tmp_pg, T, idx, cnt1);

  // Loop back if the particular char not found.
  br(MI, LOOP);

  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);

  BIND(MATCH);
  // Undo the index increment.
  sub(idx, idx, vec_len);

  // Crop the vector to find its location.
  sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
  // Count active lanes before the match: result = idx - 1 + popcount(pdn).
  add(result, idx, -1);
  sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

// Latin-1 (byte string) variant of indexof(char); same SWAR scheme as
// string_indexof_char but with byte lanes and 0x0101.. / 0x7f7f.. masks.
// Clobbers: rscratch1, rscratch2; str1/cnt1 are consumed.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  // Fewer than 8 bytes: scalar loop.
  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Broadcast 'ch' into all eight byte lanes.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  // SWAR zero-byte detection on the XORed word.
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  // Re-check the final word once for the tail.
  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
  ldrb(ch1, Address(str1, cnt1_neg));
  cmp(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 1);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
// Lexicographically compare two strings; 'ae' (StrIntrinsicNode::LL/UU/LU/UL)
// encodes the argument encodings: L = Latin-1 (1 byte/char), U = UTF-16
// (2 bytes/char). result < 0, == 0 or > 0 per String.compareTo semantics.
// Long inputs branch to the compare_long_string_* stubs.
// Clobbers: rscratch1, rscratch2, rflags; consumes cnt1/cnt2/str1/str2.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Per-encoding character loads (byte vs halfword), and the zero/sign
  // extension used when extracting the first differing character.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  // If the common prefixes are equal, this length difference is the result.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical array references compare equal without reading str2.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Bias the pointers to the string ends and iterate with a negative
      // offset so the loop-exit test falls out of the adds() flags.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin-1 side is widened to UTF-16 on the fly: load 4 bytes into
      // vtmp and zip with a zero vector to produce 4 halfwords.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    // Round the bit index down to a character boundary (8 or 16 bits).
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
  case StrIntrinsicNode::LL:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
    break;
  case StrIntrinsicNode::UU:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
    break;
  case StrIntrinsicNode::LU:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
    break;
  case StrIntrinsicNode::UL:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
    break;
  default:
    ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    // Code cache exhausted: abandon this compilation attempt cleanly.
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // Software-pipelined: compare the pair loaded last iteration while the
  // next pair is in flight.
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

// Element-wise NEON compare: each dst lane is all-ones where 'cond' holds
// for the corresponding src1/src2 lanes, all-zeros otherwise. Conditions
// without a direct encoding are synthesized by swapping operands
// (LT/LE/LO/LS) or by negating an EQ compare (NE).
void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  // Floating-point and integer element types use different compare opcodes.
  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

// Element-wise NEON compare against zero; NE is synthesized as NOT(EQ).
void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected: dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    // Non-byte lanes first get squeezed down to one byte per lane.
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Fallback without FEAT_BITPERM: scalar SWAR compression per 8 bytes.
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates mask value with the minimum unit in byte, we should
  // transform the value in the first lane which is mask in bit now to the
  // mask in byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing. As only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate mask according to the given vector, in which the elements have been
  // extended to expected type.
  // dst = 0b01101001 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Predicated SVE compare of two vectors into predicate pd. Conditions
// LE/LT/LO/LS have no direct encoding and are synthesized by swapping the
// operands.
// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
// Clobbers: rscratch1.
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the lanes so the last set lane becomes the first, break after
  // the first set lane, count active lanes, then mirror the index back.
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

// SVE element-size extension via repeated low-half unpacks
// (B->H->S->D as needed); is_unsigned selects zero vs sign extension.
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          bool is_unsigned) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

  if (src_size == B) {
    switch (dst_size) {
    case H:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      break;
    case S:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      break;
    case D:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
    } else { // D
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
    }
  } else if (src_size == S) {
    _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  // tmp holds zeros; UZP1 interleaves with it so the upper half of dst
  // ends up zeroed.
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  // Each PUNPKLO doubles the per-lane predicate width; chain 1-3 of them
  // for 2x/4x/8x element-size growth.
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  sve_pfalse(ptmp);
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Vector reduction add for integral type with ASIMD instructions.
// dst = isrc + (sum of all lanes of vsrc), with the result sign-extended
// to the element type's width for sub-int types.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      // 64-bit vectors have only two S lanes, so a pairwise add suffices.
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}

// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1]
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        // 8-byte vector: only two S lanes, multiply them directly below.
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}

// Vector reduction multiply for floating-point type with ASIMD instructions.
// dst receives the product of the scalar input fsrc and every lane of vsrc.
// Lanes are multiplied one by one in lane order (no pairwise reassociation),
// which preserves the sequential floating-point semantics Java requires.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
  switch(bt) {
    case T_FLOAT:
      fmuls(dst, fsrc, vsrc);     // dst = fsrc * vsrc[0]
      ins(vtmp, S, vsrc, 0, 1);
      fmuls(dst, dst, vtmp);      // dst *= vsrc[1]
      if (isQ) {
        ins(vtmp, S, vsrc, 0, 2);
        fmuls(dst, dst, vtmp);    // dst *= vsrc[2]
        ins(vtmp, S, vsrc, 0, 3);
        fmuls(dst, dst, vtmp);    // dst *= vsrc[3]
      }
      break;
    case T_DOUBLE:
      assert(isQ, "unsupported"); // 2 x double only exists as a 128-bit vector
      fmuld(dst, fsrc, vsrc);     // dst = fsrc * vsrc[0]
      ins(vtmp, D, vsrc, 0, 1);
      fmuld(dst, dst, vtmp);      // dst *= vsrc[1]
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select the scalar logical instruction (64- or 32-bit form of
// AND/ORR/EOR, with an optional shifted second operand) matching the
// reduction opcode.
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// The reduction is done on GP registers: the vector is moved out in (one or
// two) 64-bit chunks and then folded down by shifted logical ops.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  // Combine the two vector halves (or load the single half) into dst.
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  // Then fold dst onto itself with decreasing shifts until one element
  // remains, combine with the scalar input, and sign-extend to element size.
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      assert(isQ, "unsupported"); // 2 x long only exists as a 128-bit vector
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    // No cross-lane min/max instruction for 64-bit lanes: compare the two
    // lanes (and the scalar input) with cmp/csel on GP registers.
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, is_min ? LT : GT);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, is_min ? LT : GT);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      // Two 32-bit lanes: a single pairwise min/max does the whole reduction.
      is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
    } else {
      is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
    }
    if (bt == T_INT) {
      // 32-bit result needs no sign extension, so the cheaper umov suffices.
      umov(dst, vtmp, S, 0);
    } else {
      // Sub-word result must be sign-extended into the GP register.
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    // Fold in the scalar input.
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, is_min ? LT : GT);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        // Sign-extend the sub-word sum while adding in the scalar input.
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      // umov for full-word lanes, smov (sign-extending) for sub-word lanes.
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      // Fold in the scalar input with a compare-and-select.
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // The logical reductions combined src1 without regard to lane width:
  // re-narrow sub-word results to their element type.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue": lane counts 1-8 encode directly as the
  // pattern immediate; the remaining powers of two have dedicated encodings.
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue" that depend on the hardware vector length.
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases: lanes [0, lane_cnt) true.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // SVE "compact" only supports S/D lanes, so each half of the H-lane vector
  // is widened to S, compacted, and narrowed back, then the two compressed
  // halves are stitched together.
  // Example input: src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift(cross lane) compressed high with TRUE_CNT lanes,
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  // (tbl with out-of-range/negative indices yields zero lanes)
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high(after shifted) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Same structure as sve_compress_short: widen each B-lane half to H,
  // delegate to sve_compress_short, narrow back, then stitch the halves.
  // Example input: src   = 88 77 66 55 44 33 22 11
  //                mask  = 01 00 00 01 01 00 01 01
  // Expected result: dst = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat to the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  // vtmp4 was clobbered as a temp by sve_compress_short above; re-zero it.
  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:   dst   = 00 00 00 00 00 44 22 11
  // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift(cross lane) compressed high with TRUE_CNT lanes,
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high(after shifted) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

// Reverse the order of bits within each element of dst. For sub-long
// element types this is byte-reverse followed by rbit, since rbit alone
// reverses whole vector bytes.
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

// Reverse the order of bytes within each element of dst.
void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ?
T16B : T8B;
  switch (bt) {
    case T_BYTE:
      // Byte elements: byte-reversal is the identity; just copy if needed.
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    // Element lies within the low 128 bits: plain NEON lane moves reach it.
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      // Sub-word elements are sign-extended into the GP register.
      smov(dst, src, size, idx);
    }
  } else {
    // Element is beyond 128 bits: rotate it down to lane 0 first.
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      // rscratch1 = magnitude threshold (2^23 for float) used below to pick
      // between the two rounding results.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // Select per lane between the two candidates using the flags mask.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // rscratch1 = magnitude threshold (2^23 for float, 2^52 for double)
      // below which the floor(src + 0.5) correction is applied.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // Skip the correction entirely when no lane needs it.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

// Vector signum: per lane, dst = -1.0/+1.0 with the sign of src, or src
// itself when src is +-0.0 or NaN.
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  // Merge magnitude bits from 'one' with the sign bit of src per the mask.
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

// SVE variant of vector signum; see vector_signum_neon for the contract.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

// Returns true while C2 is measuring the size of generated code in a scratch
// buffer (in which case no real output should be committed).
bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}