/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr, rscratch2);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
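    // A rough sketch of the transition attempted by the cmpxchg below
    // (illustrative pseudocode, not generated code):
    //   if (obj->mark() == tmp /* unlocked mark */) { obj->set_mark(box); flags = EQ; }
    //   else                                        { disp_hdr = obj->mark(); flags = NE; }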
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition is true we are done and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread id, a stack address for LM_LEGACY,
  // or the ANONYMOUS_OWNER constant for LM_LIGHTWEIGHT.
  //
  // Try to CAS m->owner from null to current thread.
  ldr(rscratch2, Address(rthread, JavaThread::lock_id_offset()));
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rscratch2);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  if (LockingMode == LM_LEGACY) {
    inc_held_monitor_count();
  }

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
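  // Load the markWord and branch to the monitor path if the monitor bit (0b10) is set.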
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty.
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, tmpReg);
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont); // If so we are done.

  // Check if there is a successor.
  ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
  cmp(rscratch1, zr);
  br(Assembler::NE, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

  cmp(zr, rthread); // Set Flag to NE => slow path
  b(cont);

  bind(unlocked);
  cmp(zr, zr); // Set Flag to EQ => fast path

  // Intentional fall-through

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  if (LockingMode == LM_LEGACY) {
    dec_held_monitor_count();
  }

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
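    // A zero in the BasicLock's monitor cache slot means "no cached ObjectMonitor";
    // the slot is re-populated on the inflated path below once the monitor is known.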
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // CAS owner (null => current thread id).
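    // Sketch of the transition attempted below (the owner word holds a thread lock id,
    // not an oop); illustrative pseudocode, not generated code:
    //   if (monitor->owner == nullptr) { monitor->owner = current_lock_id; flags = EQ; }
    //   else                           { t3_owner = monitor->owner;        flags = NE; }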
    ldr(rscratch2, Address(rthread, JavaThread::lock_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
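    // Reached when the top of the lock-stack did not match obj: the lock is
    // expected to have been inflated to a full ObjectMonitor.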
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked); // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
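// ae is the argument-encoding constant from StrIntrinsicNode (LL, UU, LU or UL);
// icnt1 is the pattern length when it is a compile-time constant, or -1 when the
// length is only known at runtime.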
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
660 // 661 // #define ASIZE 256 662 // 663 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 664 // int i, j; 665 // unsigned c; 666 // unsigned char bc[ASIZE]; 667 // 668 // /* Preprocessing */ 669 // for (i = 0; i < ASIZE; ++i) 670 // bc[i] = m; 671 // for (i = 0; i < m - 1; ) { 672 // c = x[i]; 673 // ++i; 674 // // c < 256 for Latin1 string, so, no need for branch 675 // #ifdef PATTERN_STRING_IS_LATIN1 676 // bc[c] = m - i; 677 // #else 678 // if (c < ASIZE) bc[c] = m - i; 679 // #endif 680 // } 681 // 682 // /* Searching */ 683 // j = 0; 684 // while (j <= n - m) { 685 // c = y[i+j]; 686 // if (x[m-1] == c) 687 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 688 // if (i < 0) return j; 689 // // c < 256 for Latin1 string, so, no need for branch 690 // #ifdef SOURCE_STRING_IS_LATIN1 691 // // LL case: (c< 256) always true. Remove branch 692 // j += bc[y[j+m-1]]; 693 // #endif 694 // #ifndef PATTERN_STRING_IS_UTF 695 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 696 // if (c < ASIZE) 697 // j += bc[y[j+m-1]]; 698 // else 699 // j += 1 700 // #endif 701 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 702 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 703 // if (c < ASIZE) 704 // j += bc[y[j+m-1]]; 705 // else 706 // j += m 707 // #endif 708 // } 709 // } 710 711 if (icnt1 == -1) { 712 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 713 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 714 Register cnt1end = tmp2; 715 Register str2end = cnt2; 716 Register skipch = tmp2; 717 718 // str1 length is >=8, so, we can read at least 1 register for cases when 719 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 720 // UL case. We'll re-read last character in inner pre-loop code to have 721 // single outer pre-loop load 722 const int firstStep = isL ? 7 : 3; 723 724 const int ASIZE = 256; 725 const int STORED_BYTES = 32; // amount of bytes stored per instruction 726 sub(sp, sp, ASIZE); 727 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 728 mov(ch1, sp); 729 BIND(BM_INIT_LOOP); 730 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 731 subs(tmp5, tmp5, 1); 732 br(GT, BM_INIT_LOOP); 733 734 sub(cnt1tmp, cnt1, 1); 735 mov(tmp5, str2); 736 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 737 sub(ch2, cnt1, 1); 738 mov(tmp3, str1); 739 BIND(BCLOOP); 740 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 741 if (!str1_isL) { 742 subs(zr, ch1, ASIZE); 743 br(HS, BCSKIP); 744 } 745 strb(ch2, Address(sp, ch1)); 746 BIND(BCSKIP); 747 subs(ch2, ch2, 1); 748 br(GT, BCLOOP); 749 750 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 751 if (str1_isL == str2_isL) { 752 // load last 8 bytes (8LL/4UU symbols) 753 ldr(tmp6, Address(tmp6, -wordSize)); 754 } else { 755 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 756 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 757 // it's still faster than per-character loads+checks 758 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 759 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 760 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 761 andr(tmp6, tmp6, 0xFF); // str1[N-4] 762 orr(ch2, ch1, ch2, LSL, 16); 763 orr(tmp6, tmp6, tmp3, LSL, 48); 764 orr(tmp6, tmp6, ch2, LSL, 16); 765 } 766 BIND(BMLOOPSTR2); 767 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 768 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 769 if (str1_isL == str2_isL) { 770 // re-init tmp3. It's for free because it's executed in parallel with 771 // load above. Alternative is to initialize it before loop, but it'll 772 // affect performance on in-order systems with 2 or more ld/st pipelines 773 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 774 } 775 if (!isL) { // UU/UL case 776 lsl(ch2, cnt1tmp, 1); // offset in bytes 777 } 778 cmp(tmp3, skipch); 779 br(NE, BMSKIP); 780 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 781 mov(ch1, tmp6); 782 if (isL) { 783 b(BMLOOPSTR1_AFTER_LOAD); 784 } else { 785 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 786 b(BMLOOPSTR1_CMP); 787 } 788 BIND(BMLOOPSTR1); 789 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 790 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 791 BIND(BMLOOPSTR1_AFTER_LOAD); 792 subs(cnt1tmp, cnt1tmp, 1); 793 br(LT, BMLOOPSTR1_LASTCMP); 794 BIND(BMLOOPSTR1_CMP); 795 cmp(ch1, ch2); 796 br(EQ, BMLOOPSTR1); 797 BIND(BMSKIP); 798 if (!isL) { 799 // if we've met UTF symbol while searching Latin1 pattern, then we can 800 // skip cnt1 symbols 801 if (str1_isL != str2_isL) { 802 mov(result_tmp, cnt1); 803 } else { 804 mov(result_tmp, 1); 805 } 806 subs(zr, skipch, ASIZE); 807 br(HS, BMADV); 808 } 809 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 810 BIND(BMADV); 811 sub(cnt1tmp, cnt1, 1); 812 add(str2, str2, result_tmp, LSL, str2_chr_shift); 813 cmp(str2, str2end); 814 br(LE, BMLOOPSTR2); 815 add(sp, sp, ASIZE); 816 b(NOMATCH); 817 BIND(BMLOOPSTR1_LASTCMP); 818 cmp(ch1, ch2); 819 br(NE, BMSKIP); 820 BIND(BMMATCH); 821 sub(result, str2, tmp5); 822 if (!str2_isL) lsr(result, result, 1); 823 add(sp, sp, ASIZE); 824 b(DONE); 825 826 BIND(LINEARSTUB); 827 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 828 br(LT, LINEAR_MEDIUM); 829 mov(result, zr); 830 RuntimeAddress stub = nullptr; 831 if (isL) { 832 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 833 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 834 } else if (str1_isL) { 835 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 836 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 837 } else { 838 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 839 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 840 } 841 address call = trampoline_call(stub); 842 if (call == nullptr) { 843 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 844 ciEnv::current()->record_failure("CodeCache is full"); 845 return; 846 } 847 b(DONE); 848 } 849 850 BIND(LINEARSEARCH); 851 { 852 Label DO1, DO2, DO3; 853 854 Register str2tmp = tmp2; 855 Register first = tmp3; 856 857 if (icnt1 == 
-1) 858 { 859 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 860 861 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 862 br(LT, DOSHORT); 863 BIND(LINEAR_MEDIUM); 864 (this->*str1_load_1chr)(first, Address(str1)); 865 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 866 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 867 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 868 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 869 870 BIND(FIRST_LOOP); 871 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 872 cmp(first, ch2); 873 br(EQ, STR1_LOOP); 874 BIND(STR2_NEXT); 875 adds(cnt2_neg, cnt2_neg, str2_chr_size); 876 br(LE, FIRST_LOOP); 877 b(NOMATCH); 878 879 BIND(STR1_LOOP); 880 adds(cnt1tmp, cnt1_neg, str1_chr_size); 881 add(cnt2tmp, cnt2_neg, str2_chr_size); 882 br(GE, MATCH); 883 884 BIND(STR1_NEXT); 885 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 886 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 887 cmp(ch1, ch2); 888 br(NE, STR2_NEXT); 889 adds(cnt1tmp, cnt1tmp, str1_chr_size); 890 add(cnt2tmp, cnt2tmp, str2_chr_size); 891 br(LT, STR1_NEXT); 892 b(MATCH); 893 894 BIND(DOSHORT); 895 if (str1_isL == str2_isL) { 896 cmp(cnt1, (u1)2); 897 br(LT, DO1); 898 br(GT, DO3); 899 } 900 } 901 902 if (icnt1 == 4) { 903 Label CH1_LOOP; 904 905 (this->*load_4chr)(ch1, str1); 906 sub(result_tmp, cnt2, 4); 907 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 908 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 909 910 BIND(CH1_LOOP); 911 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 912 cmp(ch1, ch2); 913 br(EQ, MATCH); 914 adds(cnt2_neg, cnt2_neg, str2_chr_size); 915 br(LE, CH1_LOOP); 916 b(NOMATCH); 917 } 918 919 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 920 Label CH1_LOOP; 921 922 BIND(DO2); 923 (this->*load_2chr)(ch1, str1); 924 if (icnt1 == 2) { 925 sub(result_tmp, cnt2, 2); 926 } 927 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 928 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 929 BIND(CH1_LOOP); 930 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 931 cmp(ch1, ch2); 932 br(EQ, MATCH); 933 adds(cnt2_neg, cnt2_neg, str2_chr_size); 934 br(LE, CH1_LOOP); 935 b(NOMATCH); 936 } 937 938 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 939 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 940 941 BIND(DO3); 942 (this->*load_2chr)(first, str1); 943 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 944 if (icnt1 == 3) { 945 sub(result_tmp, cnt2, 3); 946 } 947 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 948 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 949 BIND(FIRST_LOOP); 950 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 951 cmpw(first, ch2); 952 br(EQ, STR1_LOOP); 953 BIND(STR2_NEXT); 954 adds(cnt2_neg, cnt2_neg, str2_chr_size); 955 br(LE, FIRST_LOOP); 956 b(NOMATCH); 957 958 BIND(STR1_LOOP); 959 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 960 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 961 cmp(ch1, ch2); 962 br(NE, STR2_NEXT); 963 b(MATCH); 964 } 965 966 if (icnt1 == -1 || icnt1 == 1) { 967 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 968 969 BIND(DO1); 970 (this->*str1_load_1chr)(ch1, str1); 971 cmp(cnt2, (u1)8); 972 br(LT, DO1_SHORT); 973 974 sub(result_tmp, cnt2, 8/str2_chr_size); 975 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 976 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 977 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 978 979 if (str2_isL) { 980 orr(ch1, ch1, ch1, LSL, 8); 981 } 982 orr(ch1, ch1, ch1, LSL, 16); 983 orr(ch1, ch1, ch1, LSL, 32); 984 BIND(CH1_LOOP); 985 ldr(ch2, Address(str2, cnt2_neg)); 986 eor(ch2, ch1, ch2); 987 sub(tmp1, ch2, tmp3); 988 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 989 bics(tmp1, tmp1, tmp2); 990 br(NE, HAS_ZERO); 991 adds(cnt2_neg, cnt2_neg, 8); 992 br(LT, CH1_LOOP); 993 994 cmp(cnt2_neg, (u1)8); 995 mov(cnt2_neg, 0); 996 br(LT, CH1_LOOP); 997 b(NOMATCH); 998 999 BIND(HAS_ZERO); 1000 rev(tmp1, tmp1); 1001 clz(tmp1, tmp1); 1002 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 1003 b(MATCH); 1004 1005 BIND(DO1_SHORT); 1006 mov(result_tmp, cnt2); 1007 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 1008 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 1009 BIND(DO1_LOOP); 1010 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 1011 cmpw(ch1, ch2); 1012 br(EQ, MATCH); 1013 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1014 br(LT, DO1_LOOP); 1015 } 1016 } 1017 BIND(NOMATCH); 1018 mov(result, -1); 1019 b(DONE); 1020 BIND(MATCH); 1021 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 1022 BIND(DONE); 1023 } 1024 1025 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 1026 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 1027 1028 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 1029 Register ch, Register result, 1030 Register tmp1, Register tmp2, Register tmp3) 1031 { 1032 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1033 Register cnt1_neg = cnt1; 1034 Register ch1 = rscratch1; 1035 Register result_tmp = rscratch2; 1036 1037 cbz(cnt1, NOMATCH); 1038 1039 cmp(cnt1, (u1)4); 1040 br(LT, DO1_SHORT); 1041 1042 orr(ch, ch, ch, LSL, 16); 1043 orr(ch, ch, ch, LSL, 32); 1044 1045 sub(cnt1, cnt1, 4); 1046 mov(result_tmp, cnt1); 1047 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1048 sub(cnt1_neg, zr, cnt1, LSL, 1); 1049 1050 mov(tmp3, 0x0001000100010001); 1051 1052 BIND(CH1_LOOP); 1053 ldr(ch1, Address(str1, cnt1_neg)); 1054 eor(ch1, ch, ch1); 1055 sub(tmp1, ch1, tmp3); 1056 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 1057 bics(tmp1, tmp1, tmp2); 1058 br(NE, HAS_ZERO); 1059 adds(cnt1_neg, cnt1_neg, 8); 1060 br(LT, CH1_LOOP); 1061 1062 cmp(cnt1_neg, (u1)8); 1063 mov(cnt1_neg, 0); 1064 br(LT, CH1_LOOP); 1065 b(NOMATCH); 1066 1067 BIND(HAS_ZERO); 1068 rev(tmp1, tmp1); 1069 clz(tmp1, tmp1); 1070 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1071 b(MATCH); 1072 1073 BIND(DO1_SHORT); 1074 mov(result_tmp, cnt1); 1075 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1076 sub(cnt1_neg, zr, cnt1, LSL, 1); 1077 BIND(DO1_LOOP); 1078 ldrh(ch1, Address(str1, cnt1_neg)); 1079 cmpw(ch, ch1); 1080 br(EQ, MATCH); 1081 adds(cnt1_neg, cnt1_neg, 2); 1082 br(LT, DO1_LOOP); 1083 BIND(NOMATCH); 1084 mov(result, -1); 1085 b(DONE); 1086 BIND(MATCH); 1087 add(result, result_tmp, cnt1_neg, ASR, 1); 1088 BIND(DONE); 1089 } 1090 1091 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 1092 Register ch, Register result, 1093 FloatRegister ztmp1, 1094 FloatRegister ztmp2, 1095 PRegister tmp_pg, 1096 PRegister tmp_pdn, bool isL) 1097 { 1098 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 
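  // Scalar sketch of the vector loop below (illustrative only, not generated code):
  //   for (idx = 0; idx < cnt1; idx += vec_len)
  //     if (any of str1[idx .. idx + vec_len - 1] == ch) return index of first match;
  //   return -1;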
1099 assert(tmp_pg->is_governing(), 1100 "this register has to be a governing predicate register"); 1101 1102 Label LOOP, MATCH, DONE, NOMATCH; 1103 Register vec_len = rscratch1; 1104 Register idx = rscratch2; 1105 1106 SIMD_RegVariant T = (isL == true) ? B : H; 1107 1108 cbz(cnt1, NOMATCH); 1109 1110 // Assign the particular char throughout the vector. 1111 sve_dup(ztmp2, T, ch); 1112 if (isL) { 1113 sve_cntb(vec_len); 1114 } else { 1115 sve_cnth(vec_len); 1116 } 1117 mov(idx, 0); 1118 1119 // Generate a predicate to control the reading of input string. 1120 sve_whilelt(tmp_pg, T, idx, cnt1); 1121 1122 BIND(LOOP); 1123 // Read a vector of 8- or 16-bit data depending on the string type. Note 1124 // that inactive elements indicated by the predicate register won't cause 1125 // a data read from memory to the destination vector. 1126 if (isL) { 1127 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1128 } else { 1129 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1130 } 1131 add(idx, idx, vec_len); 1132 1133 // Perform the comparison. An element of the destination predicate is set 1134 // to active if the particular char is matched. 1135 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1136 1137 // Branch if the particular char is found. 1138 br(NE, MATCH); 1139 1140 sve_whilelt(tmp_pg, T, idx, cnt1); 1141 1142 // Loop back if the particular char not found. 1143 br(MI, LOOP); 1144 1145 BIND(NOMATCH); 1146 mov(result, -1); 1147 b(DONE); 1148 1149 BIND(MATCH); 1150 // Undo the index increment. 1151 sub(idx, idx, vec_len); 1152 1153 // Crop the vector to find its location. 1154 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1155 add(result, idx, -1); 1156 sve_incp(result, T, tmp_pdn); 1157 BIND(DONE); 1158 } 1159 1160 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1161 Register ch, Register result, 1162 Register tmp1, Register tmp2, Register tmp3) 1163 { 1164 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1165 Register cnt1_neg = cnt1; 1166 Register ch1 = rscratch1; 1167 Register result_tmp = rscratch2; 1168 1169 cbz(cnt1, NOMATCH); 1170 1171 cmp(cnt1, (u1)8); 1172 br(LT, DO1_SHORT); 1173 1174 orr(ch, ch, ch, LSL, 8); 1175 orr(ch, ch, ch, LSL, 16); 1176 orr(ch, ch, ch, LSL, 32); 1177 1178 sub(cnt1, cnt1, 8); 1179 mov(result_tmp, cnt1); 1180 lea(str1, Address(str1, cnt1)); 1181 sub(cnt1_neg, zr, cnt1); 1182 1183 mov(tmp3, 0x0101010101010101); 1184 1185 BIND(CH1_LOOP); 1186 ldr(ch1, Address(str1, cnt1_neg)); 1187 eor(ch1, ch, ch1); 1188 sub(tmp1, ch1, tmp3); 1189 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1190 bics(tmp1, tmp1, tmp2); 1191 br(NE, HAS_ZERO); 1192 adds(cnt1_neg, cnt1_neg, 8); 1193 br(LT, CH1_LOOP); 1194 1195 cmp(cnt1_neg, (u1)8); 1196 mov(cnt1_neg, 0); 1197 br(LT, CH1_LOOP); 1198 b(NOMATCH); 1199 1200 BIND(HAS_ZERO); 1201 rev(tmp1, tmp1); 1202 clz(tmp1, tmp1); 1203 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1204 b(MATCH); 1205 1206 BIND(DO1_SHORT); 1207 mov(result_tmp, cnt1); 1208 lea(str1, Address(str1, cnt1)); 1209 sub(cnt1_neg, zr, cnt1); 1210 BIND(DO1_LOOP); 1211 ldrb(ch1, Address(str1, cnt1_neg)); 1212 cmp(ch, ch1); 1213 br(EQ, MATCH); 1214 adds(cnt1_neg, cnt1_neg, 1); 1215 br(LT, DO1_LOOP); 1216 BIND(NOMATCH); 1217 mov(result, -1); 1218 b(DONE); 1219 BIND(MATCH); 1220 add(result, result_tmp, cnt1_neg); 1221 BIND(DONE); 1222 } 1223 1224 // Compare strings. 
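// The result follows String.compareTo semantics: the difference of the first pair
// of characters that differ or, when one string is a prefix of the other, the
// difference of the lengths (in characters).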
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
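  // result = cnt1 - cnt2 (the subtraction also sets the flags); the conditional
  // select below leaves min(cnt1, cnt2) in cnt2.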
1267 subsw(result, cnt1, cnt2); 1268 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1269 1270 // A very short string 1271 cmpw(cnt2, minCharsInWord); 1272 br(Assembler::LE, SHORT_STRING); 1273 1274 // Compare longwords 1275 // load first parts of strings and finish initialization while loading 1276 { 1277 if (str1_isL == str2_isL) { // LL or UU 1278 ldr(tmp1, Address(str1)); 1279 cmp(str1, str2); 1280 br(Assembler::EQ, DONE); 1281 ldr(tmp2, Address(str2)); 1282 cmp(cnt2, stub_threshold); 1283 br(GE, STUB); 1284 subsw(cnt2, cnt2, minCharsInWord); 1285 br(EQ, TAIL_CHECK); 1286 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1287 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1288 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1289 } else if (isLU) { 1290 ldrs(vtmp, Address(str1)); 1291 ldr(tmp2, Address(str2)); 1292 cmp(cnt2, stub_threshold); 1293 br(GE, STUB); 1294 subw(cnt2, cnt2, 4); 1295 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1296 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1297 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1298 zip1(vtmp, T8B, vtmp, vtmpZ); 1299 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1300 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1301 add(cnt1, cnt1, 4); 1302 fmovd(tmp1, vtmp); 1303 } else { // UL case 1304 ldr(tmp1, Address(str1)); 1305 ldrs(vtmp, Address(str2)); 1306 cmp(cnt2, stub_threshold); 1307 br(GE, STUB); 1308 subw(cnt2, cnt2, 4); 1309 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1310 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1311 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1312 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1313 zip1(vtmp, T8B, vtmp, vtmpZ); 1314 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1315 add(cnt1, cnt1, 8); 1316 fmovd(tmp2, vtmp); 1317 } 1318 adds(cnt2, cnt2, isUL ? 4 : 8); 1319 br(GE, TAIL); 1320 eor(rscratch2, tmp1, tmp2); 1321 cbnz(rscratch2, DIFF); 1322 // main loop 1323 bind(NEXT_WORD); 1324 if (str1_isL == str2_isL) { 1325 ldr(tmp1, Address(str1, cnt2)); 1326 ldr(tmp2, Address(str2, cnt2)); 1327 adds(cnt2, cnt2, 8); 1328 } else if (isLU) { 1329 ldrs(vtmp, Address(str1, cnt1)); 1330 ldr(tmp2, Address(str2, cnt2)); 1331 add(cnt1, cnt1, 4); 1332 zip1(vtmp, T8B, vtmp, vtmpZ); 1333 fmovd(tmp1, vtmp); 1334 adds(cnt2, cnt2, 8); 1335 } else { // UL 1336 ldrs(vtmp, Address(str2, cnt2)); 1337 ldr(tmp1, Address(str1, cnt1)); 1338 zip1(vtmp, T8B, vtmp, vtmpZ); 1339 add(cnt1, cnt1, 8); 1340 fmovd(tmp2, vtmp); 1341 adds(cnt2, cnt2, 4); 1342 } 1343 br(GE, TAIL); 1344 1345 eor(rscratch2, tmp1, tmp2); 1346 cbz(rscratch2, NEXT_WORD); 1347 b(DIFF); 1348 bind(TAIL); 1349 eor(rscratch2, tmp1, tmp2); 1350 cbnz(rscratch2, DIFF); 1351 // Last longword. In the case where length == 4 we compare the 1352 // same longword twice, but that's still faster than another 1353 // conditional branch. 1354 if (str1_isL == str2_isL) { 1355 ldr(tmp1, Address(str1)); 1356 ldr(tmp2, Address(str2)); 1357 } else if (isLU) { 1358 ldrs(vtmp, Address(str1)); 1359 ldr(tmp2, Address(str2)); 1360 zip1(vtmp, T8B, vtmp, vtmpZ); 1361 fmovd(tmp1, vtmp); 1362 } else { // UL 1363 ldrs(vtmp, Address(str2)); 1364 ldr(tmp1, Address(str1)); 1365 zip1(vtmp, T8B, vtmp, vtmpZ); 1366 fmovd(tmp2, vtmp); 1367 } 1368 bind(TAIL_CHECK); 1369 eor(rscratch2, tmp1, tmp2); 1370 cbz(rscratch2, DONE); 1371 1372 // Find the first different characters in the longwords and 1373 // compute their difference. 1374 bind(DIFF); 1375 rev(rscratch2, rscratch2); 1376 clz(rscratch2, rscratch2); 1377 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1378 lsrv(tmp1, tmp1, rscratch2); 1379 (this->*ext_chr)(tmp1, tmp1); 1380 lsrv(tmp2, tmp2, rscratch2); 1381 (this->*ext_chr)(tmp2, tmp2); 1382 subw(result, tmp1, tmp2); 1383 b(DONE); 1384 } 1385 1386 bind(STUB); 1387 RuntimeAddress stub = nullptr; 1388 switch(ae) { 1389 case StrIntrinsicNode::LL: 1390 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1391 break; 1392 case StrIntrinsicNode::UU: 1393 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1394 break; 1395 case StrIntrinsicNode::LU: 1396 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1397 break; 1398 case StrIntrinsicNode::UL: 1399 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1400 break; 1401 default: 1402 ShouldNotReachHere(); 1403 } 1404 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1405 address call = trampoline_call(stub); 1406 if (call == nullptr) { 1407 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1408 ciEnv::current()->record_failure("CodeCache is full"); 1409 return; 1410 } 1411 b(DONE); 1412 1413 bind(SHORT_STRING); 1414 // Is the minimum length zero? 1415 cbz(cnt2, DONE); 1416 // arrange code to do most branches while loading and loading next characters 1417 // while comparing previous 1418 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1419 subs(cnt2, cnt2, 1); 1420 br(EQ, SHORT_LAST_INIT); 1421 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1422 b(SHORT_LOOP_START); 1423 bind(SHORT_LOOP); 1424 subs(cnt2, cnt2, 1); 1425 br(EQ, SHORT_LAST); 1426 bind(SHORT_LOOP_START); 1427 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1428 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1429 cmp(tmp1, cnt1); 1430 br(NE, SHORT_LOOP_TAIL); 1431 subs(cnt2, cnt2, 1); 1432 br(EQ, SHORT_LAST2); 1433 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1434 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1435 cmp(tmp2, rscratch1); 1436 br(EQ, SHORT_LOOP); 1437 sub(result, tmp2, rscratch1); 1438 b(DONE); 1439 bind(SHORT_LOOP_TAIL); 1440 sub(result, tmp1, cnt1); 1441 b(DONE); 1442 bind(SHORT_LAST2); 1443 cmp(tmp2, rscratch1); 1444 br(EQ, DONE); 1445 sub(result, tmp2, rscratch1); 1446 1447 b(DONE); 1448 bind(SHORT_LAST_INIT); 1449 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1450 bind(SHORT_LAST); 1451 cmp(tmp1, cnt1); 1452 br(EQ, DONE); 1453 sub(result, tmp1, cnt1); 1454 1455 bind(DONE); 1456 1457 BLOCK_COMMENT("} string_compare"); 1458 } 1459 1460 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1461 FloatRegister src2, Condition cond, bool isQ) { 1462 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1463 FloatRegister zn = src1, zm = src2; 1464 bool needs_negation = false; 1465 switch (cond) { 1466 case LT: cond = GT; zn = src2; zm = src1; break; 1467 case LE: cond = GE; zn = src2; zm = src1; break; 1468 case LO: cond = HI; zn = src2; zm = src1; break; 1469 case LS: cond = HS; zn = src2; zm = src1; break; 1470 case NE: cond = EQ; needs_negation = true; break; 1471 default: 1472 break; 1473 } 1474 1475 if (is_floating_point_type(bt)) { 1476 fcm(cond, dst, size, zn, zm); 1477 } else { 1478 cm(cond, dst, size, zn, zm); 1479 } 1480 1481 if (needs_negation) { 1482 notr(dst, isQ ? 
T16B : T8B, dst); 1483 } 1484 } 1485 1486 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1487 Condition cond, bool isQ) { 1488 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1489 if (bt == T_FLOAT || bt == T_DOUBLE) { 1490 if (cond == Assembler::NE) { 1491 fcm(Assembler::EQ, dst, size, src); 1492 notr(dst, isQ ? T16B : T8B, dst); 1493 } else { 1494 fcm(cond, dst, size, src); 1495 } 1496 } else { 1497 if (cond == Assembler::NE) { 1498 cm(Assembler::EQ, dst, size, src); 1499 notr(dst, isQ ? T16B : T8B, dst); 1500 } else { 1501 cm(cond, dst, size, src); 1502 } 1503 } 1504 } 1505 1506 // Compress the least significant bit of each byte to the rightmost and clear 1507 // the higher garbage bits. 1508 void C2_MacroAssembler::bytemask_compress(Register dst) { 1509 // Example input, dst = 0x01 00 00 00 01 01 00 01 1510 // The "??" bytes are garbage. 1511 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1512 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1513 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1514 andr(dst, dst, 0xff); // dst = 0x8D 1515 } 1516 1517 // Pack the lowest-numbered bit of each mask element in src into a long value 1518 // in dst, at most the first 64 lane elements. 1519 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1520 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1521 FloatRegister vtmp1, FloatRegister vtmp2) { 1522 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1523 assert_different_registers(dst, rscratch1); 1524 assert_different_registers(vtmp1, vtmp2); 1525 1526 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1527 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1528 // Expected: dst = 0x658D 1529 1530 // Convert the mask into vector with sequential bytes. 1531 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1532 sve_cpy(vtmp1, size, src, 1, false); 1533 if (bt != T_BYTE) { 1534 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1535 } 1536 1537 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1538 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1539 // is to compress each significant bit of the byte in a cross-lane way. Due 1540 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1541 // (bit-compress in each lane) with the biggest lane size (T = D) then 1542 // concatenate the results. 1543 1544 // The second source input of BEXT, initialized with 0x01 in each byte. 1545 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1546 sve_dup(vtmp2, B, 1); 1547 1548 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1549 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1550 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1551 // --------------------------------------- 1552 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1553 sve_bext(vtmp1, D, vtmp1, vtmp2); 1554 1555 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1556 // result to dst. 1557 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1558 // dst = 0x658D 1559 if (lane_cnt <= 8) { 1560 // No need to concatenate. 
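      // The compressed bits already fit in the lowest byte of vtmp1, so a single
      // byte extract yields the final mask value.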
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates mask value with the minimum unit in byte, we should
  // transform the value in the first lane which is mask in bit now to the
  // mask in byte, which can be done by SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1631 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1632 sve_dup(vtmp2, B, 1); 1633 1634 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1635 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1636 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1637 // --------------------------------------- 1638 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1639 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1640 1641 if (bt != T_BYTE) { 1642 sve_vector_extend(vtmp1, size, vtmp1, B); 1643 } 1644 // Generate mask according to the given vector, in which the elements have been 1645 // extended to expected type. 1646 // dst = 0b01101001 10001101 1647 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1648 } 1649 1650 // Clobbers: rflags 1651 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1652 FloatRegister zn, FloatRegister zm, Condition cond) { 1653 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1654 FloatRegister z1 = zn, z2 = zm; 1655 switch (cond) { 1656 case LE: z1 = zm; z2 = zn; cond = GE; break; 1657 case LT: z1 = zm; z2 = zn; cond = GT; break; 1658 case LO: z1 = zm; z2 = zn; cond = HI; break; 1659 case LS: z1 = zm; z2 = zn; cond = HS; break; 1660 default: 1661 break; 1662 } 1663 1664 SIMD_RegVariant size = elemType_to_regVariant(bt); 1665 if (is_floating_point_type(bt)) { 1666 sve_fcm(cond, pd, size, pg, z1, z2); 1667 } else { 1668 assert(is_integral_type(bt), "unsupported element type"); 1669 sve_cmp(cond, pd, size, pg, z1, z2); 1670 } 1671 } 1672 1673 // Get index of the last mask lane that is set 1674 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1675 SIMD_RegVariant size = elemType_to_regVariant(bt); 1676 sve_rev(ptmp, size, src); 1677 sve_brkb(ptmp, ptrue, ptmp, false); 1678 sve_cntp(dst, size, ptrue, ptmp); 1679 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1680 subw(dst, rscratch1, dst); 1681 } 1682 1683 // Extend integer vector src to dst with the same lane count 1684 // but larger element size, e.g. 4B -> 4I 1685 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1686 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1687 if (src_bt == T_BYTE) { 1688 if (dst_bt == T_SHORT) { 1689 // 4B/8B to 4S/8S 1690 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1691 } else { 1692 // 4B to 4I 1693 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1694 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1695 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1696 } 1697 } else if (src_bt == T_SHORT) { 1698 // 4S to 4I 1699 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1700 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1701 } else if (src_bt == T_INT) { 1702 // 2I to 2L 1703 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1704 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1705 } else { 1706 ShouldNotReachHere(); 1707 } 1708 } 1709 1710 // Narrow integer vector src down to dst with the same lane count 1711 // but smaller element size, e.g. 
4I -> 4B 1712 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1713 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1714 if (src_bt == T_SHORT) { 1715 // 4S/8S to 4B/8B 1716 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1717 assert(dst_bt == T_BYTE, "unsupported"); 1718 xtn(dst, T8B, src, T8H); 1719 } else if (src_bt == T_INT) { 1720 // 4I to 4B/4S 1721 assert(src_vlen_in_bytes == 16, "unsupported"); 1722 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1723 xtn(dst, T4H, src, T4S); 1724 if (dst_bt == T_BYTE) { 1725 xtn(dst, T8B, dst, T8H); 1726 } 1727 } else if (src_bt == T_LONG) { 1728 // 2L to 2I 1729 assert(src_vlen_in_bytes == 16, "unsupported"); 1730 assert(dst_bt == T_INT, "unsupported"); 1731 xtn(dst, T2S, src, T2D); 1732 } else { 1733 ShouldNotReachHere(); 1734 } 1735 } 1736 1737 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1738 FloatRegister src, SIMD_RegVariant src_size, 1739 bool is_unsigned) { 1740 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1741 1742 if (src_size == B) { 1743 switch (dst_size) { 1744 case H: 1745 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1746 break; 1747 case S: 1748 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1749 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1750 break; 1751 case D: 1752 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1753 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1754 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1755 break; 1756 default: 1757 ShouldNotReachHere(); 1758 } 1759 } else if (src_size == H) { 1760 if (dst_size == S) { 1761 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1762 } else { // D 1763 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1764 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1765 } 1766 } else if (src_size == S) { 1767 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1768 } 1769 } 1770 1771 // Vector narrow from src to dst with specified element sizes. 1772 // High part of dst vector will be filled with zero. 1773 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1774 FloatRegister src, SIMD_RegVariant src_size, 1775 FloatRegister tmp) { 1776 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1777 assert_different_registers(src, tmp); 1778 sve_dup(tmp, src_size, 0); 1779 if (src_size == D) { 1780 switch (dst_size) { 1781 case S: 1782 sve_uzp1(dst, S, src, tmp); 1783 break; 1784 case H: 1785 assert_different_registers(dst, tmp); 1786 sve_uzp1(dst, S, src, tmp); 1787 sve_uzp1(dst, H, dst, tmp); 1788 break; 1789 case B: 1790 assert_different_registers(dst, tmp); 1791 sve_uzp1(dst, S, src, tmp); 1792 sve_uzp1(dst, H, dst, tmp); 1793 sve_uzp1(dst, B, dst, tmp); 1794 break; 1795 default: 1796 ShouldNotReachHere(); 1797 } 1798 } else if (src_size == S) { 1799 if (dst_size == H) { 1800 sve_uzp1(dst, H, src, tmp); 1801 } else { // B 1802 assert_different_registers(dst, tmp); 1803 sve_uzp1(dst, H, src, tmp); 1804 sve_uzp1(dst, B, dst, tmp); 1805 } 1806 } else if (src_size == H) { 1807 sve_uzp1(dst, B, src, tmp); 1808 } 1809 } 1810 1811 // Extend src predicate to dst predicate with the same lane count but larger 1812 // element size, e.g. 
64Byte -> 512Long 1813 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1814 uint dst_element_length_in_bytes, 1815 uint src_element_length_in_bytes) { 1816 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1817 sve_punpklo(dst, src); 1818 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1819 sve_punpklo(dst, src); 1820 sve_punpklo(dst, dst); 1821 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1822 sve_punpklo(dst, src); 1823 sve_punpklo(dst, dst); 1824 sve_punpklo(dst, dst); 1825 } else { 1826 assert(false, "unsupported"); 1827 ShouldNotReachHere(); 1828 } 1829 } 1830 1831 // Narrow src predicate to dst predicate with the same lane count but 1832 // smaller element size, e.g. 512Long -> 64Byte 1833 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1834 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1835 // The insignificant bits in src predicate are expected to be zero. 1836 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1837 // passed as the second argument. An example narrowing operation with a given mask would be - 1838 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1839 // Mask (for 2 Longs) : TF 1840 // Predicate register for the above mask (16 bits) : 00000001 00000000 1841 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1842 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1843 assert_different_registers(src, ptmp); 1844 assert_different_registers(dst, ptmp); 1845 sve_pfalse(ptmp); 1846 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1847 sve_uzp1(dst, B, src, ptmp); 1848 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1849 sve_uzp1(dst, H, src, ptmp); 1850 sve_uzp1(dst, B, dst, ptmp); 1851 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1852 sve_uzp1(dst, S, src, ptmp); 1853 sve_uzp1(dst, H, dst, ptmp); 1854 sve_uzp1(dst, B, dst, ptmp); 1855 } else { 1856 assert(false, "unsupported"); 1857 ShouldNotReachHere(); 1858 } 1859 } 1860 1861 // Vector reduction add for integral type with ASIMD instructions. 1862 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1863 Register isrc, FloatRegister vsrc, 1864 unsigned vector_length_in_bytes, 1865 FloatRegister vtmp) { 1866 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1867 assert_different_registers(dst, isrc); 1868 bool isQ = vector_length_in_bytes == 16; 1869 1870 BLOCK_COMMENT("neon_reduce_add_integral {"); 1871 switch(bt) { 1872 case T_BYTE: 1873 addv(vtmp, isQ ? T16B : T8B, vsrc); 1874 smov(dst, vtmp, B, 0); 1875 addw(dst, dst, isrc, ext::sxtb); 1876 break; 1877 case T_SHORT: 1878 addv(vtmp, isQ ? T8H : T4H, vsrc); 1879 smov(dst, vtmp, H, 0); 1880 addw(dst, dst, isrc, ext::sxth); 1881 break; 1882 case T_INT: 1883 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1884 umov(dst, vtmp, S, 0); 1885 addw(dst, dst, isrc); 1886 break; 1887 case T_LONG: 1888 assert(isQ, "unsupported"); 1889 addpd(vtmp, vsrc); 1890 umov(dst, vtmp, D, 0); 1891 add(dst, dst, isrc); 1892 break; 1893 default: 1894 assert(false, "unsupported"); 1895 ShouldNotReachHere(); 1896 } 1897 BLOCK_COMMENT("} neon_reduce_add_integral"); 1898 } 1899 1900 // Vector reduction multiply for integral type with ASIMD instructions. 
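// A minimal scalar sketch (illustration only, not emitted code) of what this
// reduction computes for T_INT on a 128-bit vector; the vector code below reaches
// the same result by iteratively multiplying the lower and upper halves:
//
//   static int32_t reduce_mul_int(int32_t isrc, const int32_t lanes[4]) {
//     int32_t acc = isrc;
//     for (int i = 0; i < 4; i++) {
//       acc *= lanes[i];  // fold every lane into the scalar input
//     }
//     return acc;
//   }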
1901 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1902 // Clobbers: rscratch1 1903 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1904 Register isrc, FloatRegister vsrc, 1905 unsigned vector_length_in_bytes, 1906 FloatRegister vtmp1, FloatRegister vtmp2) { 1907 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1908 bool isQ = vector_length_in_bytes == 16; 1909 1910 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1911 switch(bt) { 1912 case T_BYTE: 1913 if (isQ) { 1914 // Multiply the lower half and higher half of vector iteratively. 1915 // vtmp1 = vsrc[8:15] 1916 ins(vtmp1, D, vsrc, 0, 1); 1917 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1918 mulv(vtmp1, T8B, vtmp1, vsrc); 1919 // vtmp2 = vtmp1[4:7] 1920 ins(vtmp2, S, vtmp1, 0, 1); 1921 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1922 mulv(vtmp1, T8B, vtmp2, vtmp1); 1923 } else { 1924 ins(vtmp1, S, vsrc, 0, 1); 1925 mulv(vtmp1, T8B, vtmp1, vsrc); 1926 } 1927 // vtmp2 = vtmp1[2:3] 1928 ins(vtmp2, H, vtmp1, 0, 1); 1929 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1930 mulv(vtmp2, T8B, vtmp2, vtmp1); 1931 // dst = vtmp2[0] * isrc * vtmp2[1] 1932 umov(rscratch1, vtmp2, B, 0); 1933 mulw(dst, rscratch1, isrc); 1934 sxtb(dst, dst); 1935 umov(rscratch1, vtmp2, B, 1); 1936 mulw(dst, rscratch1, dst); 1937 sxtb(dst, dst); 1938 break; 1939 case T_SHORT: 1940 if (isQ) { 1941 ins(vtmp2, D, vsrc, 0, 1); 1942 mulv(vtmp2, T4H, vtmp2, vsrc); 1943 ins(vtmp1, S, vtmp2, 0, 1); 1944 mulv(vtmp1, T4H, vtmp1, vtmp2); 1945 } else { 1946 ins(vtmp1, S, vsrc, 0, 1); 1947 mulv(vtmp1, T4H, vtmp1, vsrc); 1948 } 1949 umov(rscratch1, vtmp1, H, 0); 1950 mulw(dst, rscratch1, isrc); 1951 sxth(dst, dst); 1952 umov(rscratch1, vtmp1, H, 1); 1953 mulw(dst, rscratch1, dst); 1954 sxth(dst, dst); 1955 break; 1956 case T_INT: 1957 if (isQ) { 1958 ins(vtmp1, D, vsrc, 0, 1); 1959 mulv(vtmp1, T2S, vtmp1, vsrc); 1960 } else { 1961 vtmp1 = vsrc; 1962 } 1963 umov(rscratch1, vtmp1, S, 0); 1964 mul(dst, rscratch1, isrc); 1965 umov(rscratch1, vtmp1, S, 1); 1966 mul(dst, rscratch1, dst); 1967 break; 1968 case T_LONG: 1969 umov(rscratch1, vsrc, D, 0); 1970 mul(dst, isrc, rscratch1); 1971 umov(rscratch1, vsrc, D, 1); 1972 mul(dst, dst, rscratch1); 1973 break; 1974 default: 1975 assert(false, "unsupported"); 1976 ShouldNotReachHere(); 1977 } 1978 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1979 } 1980 1981 // Vector reduction multiply for floating-point type with ASIMD instructions. 
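// A minimal scalar sketch (illustration only, not emitted code) of the T_FLOAT
// case on a 128-bit vector; the lanes are folded one at a time, in lane order,
// matching the instruction sequence below:
//
//   static float reduce_mul_float(float fsrc, const float lanes[4]) {
//     float acc = fsrc;
//     for (int i = 0; i < 4; i++) {
//       acc *= lanes[i];  // ((((fsrc * v0) * v1) * v2) * v3)
//     }
//     return acc;
//   }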
1982 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1983 FloatRegister fsrc, FloatRegister vsrc, 1984 unsigned vector_length_in_bytes, 1985 FloatRegister vtmp) { 1986 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1987 bool isQ = vector_length_in_bytes == 16; 1988 1989 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1990 switch(bt) { 1991 case T_FLOAT: 1992 fmuls(dst, fsrc, vsrc); 1993 ins(vtmp, S, vsrc, 0, 1); 1994 fmuls(dst, dst, vtmp); 1995 if (isQ) { 1996 ins(vtmp, S, vsrc, 0, 2); 1997 fmuls(dst, dst, vtmp); 1998 ins(vtmp, S, vsrc, 0, 3); 1999 fmuls(dst, dst, vtmp); 2000 } 2001 break; 2002 case T_DOUBLE: 2003 assert(isQ, "unsupported"); 2004 fmuld(dst, fsrc, vsrc); 2005 ins(vtmp, D, vsrc, 0, 1); 2006 fmuld(dst, dst, vtmp); 2007 break; 2008 default: 2009 assert(false, "unsupported"); 2010 ShouldNotReachHere(); 2011 } 2012 BLOCK_COMMENT("} neon_reduce_mul_fp"); 2013 } 2014 2015 // Helper to select logical instruction 2016 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 2017 Register Rn, Register Rm, 2018 enum shift_kind kind, unsigned shift) { 2019 switch(opc) { 2020 case Op_AndReductionV: 2021 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 2022 break; 2023 case Op_OrReductionV: 2024 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 2025 break; 2026 case Op_XorReductionV: 2027 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 2028 break; 2029 default: 2030 assert(false, "unsupported"); 2031 ShouldNotReachHere(); 2032 } 2033 } 2034 2035 // Vector reduction logical operations And, Or, Xor 2036 // Clobbers: rscratch1 2037 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2038 Register isrc, FloatRegister vsrc, 2039 unsigned vector_length_in_bytes) { 2040 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2041 "unsupported"); 2042 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2043 assert_different_registers(dst, isrc); 2044 bool isQ = vector_length_in_bytes == 16; 2045 2046 BLOCK_COMMENT("neon_reduce_logical {"); 2047 umov(rscratch1, vsrc, isQ ? D : S, 0); 2048 umov(dst, vsrc, isQ ? 
D : S, 1); 2049 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2050 switch(bt) { 2051 case T_BYTE: 2052 if (isQ) { 2053 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2054 } 2055 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2056 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2057 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2058 sxtb(dst, dst); 2059 break; 2060 case T_SHORT: 2061 if (isQ) { 2062 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2063 } 2064 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2065 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2066 sxth(dst, dst); 2067 break; 2068 case T_INT: 2069 if (isQ) { 2070 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2071 } 2072 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2073 break; 2074 case T_LONG: 2075 assert(isQ, "unsupported"); 2076 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2077 break; 2078 default: 2079 assert(false, "unsupported"); 2080 ShouldNotReachHere(); 2081 } 2082 BLOCK_COMMENT("} neon_reduce_logical"); 2083 } 2084 2085 // Vector reduction min/max for integral type with ASIMD instructions. 2086 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2087 // Clobbers: rscratch1, rflags 2088 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2089 Register isrc, FloatRegister vsrc, 2090 unsigned vector_length_in_bytes, 2091 FloatRegister vtmp) { 2092 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2093 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2094 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2095 assert_different_registers(dst, isrc); 2096 bool isQ = vector_length_in_bytes == 16; 2097 bool is_min = opc == Op_MinReductionV; 2098 2099 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2100 if (bt == T_LONG) { 2101 assert(vtmp == fnoreg, "should be"); 2102 assert(isQ, "should be"); 2103 umov(rscratch1, vsrc, D, 0); 2104 cmp(isrc, rscratch1); 2105 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2106 umov(rscratch1, vsrc, D, 1); 2107 cmp(dst, rscratch1); 2108 csel(dst, dst, rscratch1, is_min ? LT : GT); 2109 } else { 2110 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2111 if (size == T2S) { 2112 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2113 } else { 2114 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2115 } 2116 if (bt == T_INT) { 2117 umov(dst, vtmp, S, 0); 2118 } else { 2119 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2120 } 2121 cmpw(dst, isrc); 2122 cselw(dst, dst, isrc, is_min ? LT : GT); 2123 } 2124 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2125 } 2126 2127 // Vector reduction for integral type with SVE instruction. 2128 // Supported operations are Add, And, Or, Xor, Max, Min. 2129 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
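// A minimal scalar sketch (illustration only, not emitted code) of the
// Op_AndReductionV case for bt == T_BYTE; the other opcodes follow the same
// shape with their own fold operation and final sign-extension handling:
//
//   static int32_t reduce_and_bytes(int32_t src1, const int8_t* lanes, int n) {
//     int8_t acc = (int8_t)0xFF;                 // identity element for AND
//     for (int i = 0; i < n; i++) {
//       acc &= lanes[i];                         // what sve_andv folds across active lanes
//     }
//     return (int32_t)(int8_t)(acc & src1);      // combine with src1, sign-extend to int
//   }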
2130 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2131 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2132 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2133 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2134 assert_different_registers(src1, dst); 2135 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2136 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2137 switch (opc) { 2138 case Op_AddReductionVI: { 2139 sve_uaddv(tmp, size, pg, src2); 2140 if (bt == T_BYTE) { 2141 smov(dst, tmp, size, 0); 2142 addw(dst, src1, dst, ext::sxtb); 2143 } else if (bt == T_SHORT) { 2144 smov(dst, tmp, size, 0); 2145 addw(dst, src1, dst, ext::sxth); 2146 } else { 2147 umov(dst, tmp, size, 0); 2148 addw(dst, dst, src1); 2149 } 2150 break; 2151 } 2152 case Op_AddReductionVL: { 2153 sve_uaddv(tmp, size, pg, src2); 2154 umov(dst, tmp, size, 0); 2155 add(dst, dst, src1); 2156 break; 2157 } 2158 case Op_AndReductionV: { 2159 sve_andv(tmp, size, pg, src2); 2160 if (bt == T_INT || bt == T_LONG) { 2161 umov(dst, tmp, size, 0); 2162 } else { 2163 smov(dst, tmp, size, 0); 2164 } 2165 if (bt == T_LONG) { 2166 andr(dst, dst, src1); 2167 } else { 2168 andw(dst, dst, src1); 2169 } 2170 break; 2171 } 2172 case Op_OrReductionV: { 2173 sve_orv(tmp, size, pg, src2); 2174 if (bt == T_INT || bt == T_LONG) { 2175 umov(dst, tmp, size, 0); 2176 } else { 2177 smov(dst, tmp, size, 0); 2178 } 2179 if (bt == T_LONG) { 2180 orr(dst, dst, src1); 2181 } else { 2182 orrw(dst, dst, src1); 2183 } 2184 break; 2185 } 2186 case Op_XorReductionV: { 2187 sve_eorv(tmp, size, pg, src2); 2188 if (bt == T_INT || bt == T_LONG) { 2189 umov(dst, tmp, size, 0); 2190 } else { 2191 smov(dst, tmp, size, 0); 2192 } 2193 if (bt == T_LONG) { 2194 eor(dst, dst, src1); 2195 } else { 2196 eorw(dst, dst, src1); 2197 } 2198 break; 2199 } 2200 case Op_MaxReductionV: { 2201 sve_smaxv(tmp, size, pg, src2); 2202 if (bt == T_INT || bt == T_LONG) { 2203 umov(dst, tmp, size, 0); 2204 } else { 2205 smov(dst, tmp, size, 0); 2206 } 2207 if (bt == T_LONG) { 2208 cmp(dst, src1); 2209 csel(dst, dst, src1, Assembler::GT); 2210 } else { 2211 cmpw(dst, src1); 2212 cselw(dst, dst, src1, Assembler::GT); 2213 } 2214 break; 2215 } 2216 case Op_MinReductionV: { 2217 sve_sminv(tmp, size, pg, src2); 2218 if (bt == T_INT || bt == T_LONG) { 2219 umov(dst, tmp, size, 0); 2220 } else { 2221 smov(dst, tmp, size, 0); 2222 } 2223 if (bt == T_LONG) { 2224 cmp(dst, src1); 2225 csel(dst, dst, src1, Assembler::LT); 2226 } else { 2227 cmpw(dst, src1); 2228 cselw(dst, dst, src1, Assembler::LT); 2229 } 2230 break; 2231 } 2232 default: 2233 assert(false, "unsupported"); 2234 ShouldNotReachHere(); 2235 } 2236 2237 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2238 if (bt == T_BYTE) { 2239 sxtb(dst, dst); 2240 } else if (bt == T_SHORT) { 2241 sxth(dst, dst); 2242 } 2243 } 2244 } 2245 2246 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2247 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2248 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
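// A minimal scalar sketch (illustration only, not emitted code) of the predicate
// this routine materializes: lane i of dst is active iff i < lane_cnt.
//
//   static void gen_mask_model(bool* lanes, uint32_t max_lanes, uint32_t lane_cnt) {
//     for (uint32_t i = 0; i < max_lanes; i++) {
//       lanes[i] = (i < lane_cnt);  // same effect as the ptrue patterns / whileltw below
//     }
//   }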
2249 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2250 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2251 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2252
2253 // Set all elements to false if the input "lane_cnt" is zero.
2254 if (lane_cnt == 0) {
2255 sve_pfalse(dst);
2256 return;
2257 }
2258
2259 SIMD_RegVariant size = elemType_to_regVariant(bt);
2260 assert(size != Q, "invalid size");
2261
2262 // Set all elements to true if "lane_cnt" equals the max lane count.
2263 if (lane_cnt == max_vector_length) {
2264 sve_ptrue(dst, size, /* ALL */ 0b11111);
2265 return;
2266 }
2267
2268 // Fixed numbers for "ptrue".
2269 switch(lane_cnt) {
2270 case 1: /* VL1 */
2271 case 2: /* VL2 */
2272 case 3: /* VL3 */
2273 case 4: /* VL4 */
2274 case 5: /* VL5 */
2275 case 6: /* VL6 */
2276 case 7: /* VL7 */
2277 case 8: /* VL8 */
2278 sve_ptrue(dst, size, lane_cnt);
2279 return;
2280 case 16:
2281 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2282 return;
2283 case 32:
2284 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2285 return;
2286 case 64:
2287 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2288 return;
2289 case 128:
2290 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2291 return;
2292 case 256:
2293 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2294 return;
2295 default:
2296 break;
2297 }
2298
2299 // Special patterns for "ptrue".
2300 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2301 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2302 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2303 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2304 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2305 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2306 } else {
2307 // Encode to "whileltw" for the remaining cases.
2308 mov(rscratch1, lane_cnt);
2309 sve_whileltw(dst, size, zr, rscratch1);
2310 }
2311 }
2312
2313 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2314 // Any remaining elements of dst will be filled with zero.
2315 // Clobbers: rscratch1
2316 // Preserves: src, mask
2317 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2318 FloatRegister vtmp1, FloatRegister vtmp2,
2319 PRegister pgtmp) {
2320 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2321 assert_different_registers(dst, src, vtmp1, vtmp2);
2322 assert_different_registers(mask, pgtmp);
2323
2324 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
2325 // mask = 0001 0000 0000 0001 0001 0000 0001 0001
2326 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
2327 sve_dup(vtmp2, H, 0);
2328
2329 // Extend the lowest half to type INT.
2330 // dst = 00004444 00003333 00002222 00001111
2331 sve_uunpklo(dst, S, src);
2332 // pgtmp = 00000001 00000000 00000001 00000001
2333 sve_punpklo(pgtmp, mask);
2334 // Pack the active elements of type INT to the right,
2335 // and fill the remaining elements with zero.
2336 // dst = 00000000 00004444 00002222 00001111
2337 sve_compact(dst, S, dst, pgtmp);
2338 // Narrow the result back to type SHORT.
2339 // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2340 sve_uzp1(dst, H, dst, vtmp2);
2341 // Count the active elements of the lowest half.
2342 // rscratch1 = 3
2343 sve_cntp(rscratch1, S, ptrue, pgtmp);
2344
2345 // Repeat for the highest half.
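// The upper half is processed below with the same unpack/compact/narrow steps and
// is then shifted left by TRUE_CNT lanes (out-of-range sve_tbl indices read as
// zero) before being OR-ed into dst. A minimal scalar sketch (illustration only,
// not emitted code) of that final merge, assuming 8 lanes where dst[] holds the
// compressed low half and high_c[] the compressed high half:
//
//   for (int i = true_cnt; i < 8; i++) {
//     dst[i] |= high_c[i - true_cnt];
//   }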
2346 // pgtmp = 00000001 00000000 00000000 00000001
2347 sve_punpkhi(pgtmp, mask);
2348 // vtmp1 = 00008888 00007777 00006666 00005555
2349 sve_uunpkhi(vtmp1, S, src);
2350 // vtmp1 = 00000000 00000000 00008888 00005555
2351 sve_compact(vtmp1, S, vtmp1, pgtmp);
2352 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2353 sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2354
2355 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
2356 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2357 // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
2358 // TRUE_CNT is the number of active elements in the compressed low.
2359 neg(rscratch1, rscratch1);
2360 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2361 sve_index(vtmp2, H, rscratch1, 1);
2362 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2363 sve_tbl(vtmp1, H, vtmp1, vtmp2);
2364
2365 // Combine the compressed high (after shifting) with the compressed low.
2366 // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2367 sve_orr(dst, dst, vtmp1);
2368 }
2369
2370 // Clobbers: rscratch1, rscratch2
2371 // Preserves: src, mask
2372 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2373 FloatRegister vtmp1, FloatRegister vtmp2,
2374 FloatRegister vtmp3, FloatRegister vtmp4,
2375 PRegister ptmp, PRegister pgtmp) {
2376 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2377 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2378 assert_different_registers(mask, ptmp, pgtmp);
2379 // Example input: src = 88 77 66 55 44 33 22 11
2380 // mask = 01 00 00 01 01 00 01 01
2381 // Expected result: dst = 00 00 00 88 55 44 22 11
2382
2383 sve_dup(vtmp4, B, 0);
2384 // Extend the lowest half to type SHORT.
2385 // vtmp1 = 0044 0033 0022 0011
2386 sve_uunpklo(vtmp1, H, src);
2387 // ptmp = 0001 0000 0001 0001
2388 sve_punpklo(ptmp, mask);
2389 // Count the active elements of the lowest half.
2390 // rscratch2 = 3
2391 sve_cntp(rscratch2, H, ptrue, ptmp);
2392 // Pack the active elements of type SHORT to the right,
2393 // and fill the remaining elements with zero.
2394 // dst = 0000 0044 0022 0011
2395 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2396 // Narrow the result back to type BYTE.
2397 // dst = 00 00 00 00 00 44 22 11
2398 sve_uzp1(dst, B, dst, vtmp4);
2399
2400 // Repeat for the highest half.
2401 // ptmp = 0001 0000 0000 0001
2402 sve_punpkhi(ptmp, mask);
2403 // vtmp2 = 0088 0077 0066 0055
2404 sve_uunpkhi(vtmp2, H, src);
2405 // vtmp1 = 0000 0000 0088 0055
2406 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2407
2408 sve_dup(vtmp4, B, 0);
2409 // vtmp1 = 00 00 00 00 00 00 88 55
2410 sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2411
2412 // Compressed low: dst = 00 00 00 00 00 44 22 11
2413 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
2414 // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
2415 // TRUE_CNT is the number of active elements in the compressed low.
2416 neg(rscratch2, rscratch2);
2417 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2418 sve_index(vtmp2, B, rscratch2, 1);
2419 // vtmp1 = 00 00 00 88 55 00 00 00
2420 sve_tbl(vtmp1, B, vtmp1, vtmp2);
2421 // Combine the compressed high (after shifting) with the compressed low.
2422 // dst = 00 00 00 88 55 44 22 11 2423 sve_orr(dst, dst, vtmp1); 2424 } 2425 2426 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2427 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2428 SIMD_Arrangement size = isQ ? T16B : T8B; 2429 if (bt == T_BYTE) { 2430 rbit(dst, size, src); 2431 } else { 2432 neon_reverse_bytes(dst, src, bt, isQ); 2433 rbit(dst, size, dst); 2434 } 2435 } 2436 2437 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2438 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2439 SIMD_Arrangement size = isQ ? T16B : T8B; 2440 switch (bt) { 2441 case T_BYTE: 2442 if (dst != src) { 2443 orr(dst, size, src, src); 2444 } 2445 break; 2446 case T_SHORT: 2447 rev16(dst, size, src); 2448 break; 2449 case T_INT: 2450 rev32(dst, size, src); 2451 break; 2452 case T_LONG: 2453 rev64(dst, size, src); 2454 break; 2455 default: 2456 assert(false, "unsupported"); 2457 ShouldNotReachHere(); 2458 } 2459 } 2460 2461 // Extract a scalar element from an sve vector at position 'idx'. 2462 // The input elements in src are expected to be of integral type. 2463 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2464 int idx, FloatRegister vtmp) { 2465 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2466 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2467 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2468 if (bt == T_INT || bt == T_LONG) { 2469 umov(dst, src, size, idx); 2470 } else { 2471 smov(dst, src, size, idx); 2472 } 2473 } else { 2474 sve_orr(vtmp, src, src); 2475 sve_ext(vtmp, vtmp, idx << size); 2476 if (bt == T_INT || bt == T_LONG) { 2477 umov(dst, vtmp, size, 0); 2478 } else { 2479 smov(dst, vtmp, size, 0); 2480 } 2481 } 2482 } 2483 2484 // java.lang.Math::round intrinsics 2485 2486 // Clobbers: rscratch1, rflags 2487 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2488 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2489 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2490 switch (T) { 2491 case T2S: 2492 case T4S: 2493 fmovs(tmp1, T, 0.5f); 2494 mov(rscratch1, jint_cast(0x1.0p23f)); 2495 break; 2496 case T2D: 2497 fmovd(tmp1, T, 0.5); 2498 mov(rscratch1, julong_cast(0x1.0p52)); 2499 break; 2500 default: 2501 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2502 } 2503 fadd(tmp1, T, tmp1, src); 2504 fcvtms(tmp1, T, tmp1); 2505 // tmp1 = floor(src + 0.5, ties to even) 2506 2507 fcvtas(dst, T, src); 2508 // dst = round(src), ties to away 2509 2510 fneg(tmp3, T, src); 2511 dup(tmp2, T, rscratch1); 2512 cm(HS, tmp3, T, tmp3, tmp2); 2513 // tmp3 is now a set of flags 2514 2515 bif(dst, T16B, tmp1, tmp3); 2516 // result in dst 2517 } 2518 2519 // Clobbers: rscratch1, rflags 2520 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2521 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2522 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2523 assert_different_registers(tmp1, tmp2, src, dst); 2524 2525 switch (T) { 2526 case S: 2527 mov(rscratch1, jint_cast(0x1.0p23f)); 2528 break; 2529 case D: 2530 mov(rscratch1, julong_cast(0x1.0p52)); 2531 break; 2532 
default:
2533 assert(T == S || T == D, "invalid register variant");
2534 }
2535
2536 sve_frinta(dst, T, ptrue, src);
2537 // dst = round(src), ties to away
2538
2539 Label none;
2540
2541 sve_fneg(tmp1, T, ptrue, src);
2542 sve_dup(tmp2, T, rscratch1);
2543 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2544 br(EQ, none);
2545 {
2546 sve_cpy(tmp1, T, pgtmp, 0.5);
2547 sve_fadd(tmp1, T, pgtmp, src);
2548 sve_frintm(dst, T, pgtmp, tmp1);
2549 // dst = floor(src + 0.5, ties to even)
2550 }
2551 bind(none);
2552
2553 sve_fcvtzs(dst, T, ptrue, dst, T);
2554 // result in dst
2555 }
2556
2557 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2558 FloatRegister one, SIMD_Arrangement T) {
2559 assert_different_registers(dst, src, zero, one);
2560 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2561
2562 facgt(dst, T, src, zero);
2563 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2564 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2565 }
2566
2567 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2568 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2569 assert_different_registers(dst, src, zero, one, vtmp);
2570 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2571
2572 sve_orr(vtmp, src, src);
2573 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2574 switch (T) {
2575 case S:
2576 sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
2577 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2578 // on the sign of the float value
2579 break;
2580 case D:
2581 sve_and(vtmp, T, min_jlong);
2582 sve_orr(vtmp, T, jlong_cast(1.0));
2583 break;
2584 default:
2585 assert(false, "unsupported");
2586 ShouldNotReachHere();
2587 }
2588 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2589 // Result in dst
2590 }
2591
2592 bool C2_MacroAssembler::in_scratch_emit_size() {
2593 if (ciEnv::current()->task() != nullptr) {
2594 PhaseOutput* phase_output = Compile::current()->output();
2595 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2596 return true;
2597 }
2598 }
2599 return MacroAssembler::in_scratch_emit_size();
2600 }