/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If condition is true we are cont and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock, this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty.
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, tmpReg);
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont); // If so we are done.

  // Check if there is a successor.
  ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
  cmp(rscratch1, zr);
  br(Assembler::NE, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

  cmp(zr, rthread); // Set Flag to NE => slow path
  b(cont);

  bind(unlocked);
  cmp(zr, zr); // Set Flag to EQ => fast path

  // Intentional fall-through

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked); // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = y[i+j];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c< 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
    stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
    subs(tmp5, tmp5, 1);
    br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
    (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
    if (!str1_isL) {
      subs(zr, ch1, ASIZE);
      br(HS, BCSKIP);
    }
    strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
    subs(ch2, ch2, 1);
    br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8);  // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
    (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
    if (str1_isL == str2_isL) {
      // re-init tmp3. It's for free because it's executed in parallel with
      // load above. Alternative is to initialize it before loop, but it'll
      // affect performance on in-order systems with 2 or more ld/st pipelines
      lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
    }
    if (!isL) { // UU/UL case
      lsl(ch2, cnt1tmp, 1); // offset in bytes
    }
    cmp(tmp3, skipch);
    br(NE, BMSKIP);
    ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
    mov(ch1, tmp6);
    if (isL) {
      b(BMLOOPSTR1_AFTER_LOAD);
    } else {
      sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
      b(BMLOOPSTR1_CMP);
    }
    BIND(BMLOOPSTR1);
    (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
    (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
    subs(cnt1tmp, cnt1tmp, 1);
    br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
    cmp(ch1, ch2);
    br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
    if (!isL) {
      // if we've met UTF symbol while searching Latin1 pattern, then we can
      // skip cnt1 symbols
      if (str1_isL != str2_isL) {
        mov(result_tmp, cnt1);
      } else {
        mov(result_tmp, 1);
      }
      subs(zr, skipch, ASIZE);
      br(HS, BMADV);
    }
    ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
    sub(cnt1tmp, cnt1, 1);
    add(str2, str2, result_tmp, LSL, str2_chr_shift);
    cmp(str2, str2end);
    br(LE, BMLOOPSTR2);
    add(sp, sp, ASIZE);
    b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
    cmp(ch1, ch2);
    br(NE, BMSKIP);
    BIND(BMMATCH);
    sub(result, str2, tmp5);
    if (!str2_isL) lsr(result, result, 1);
    add(sp, sp, ASIZE);
    b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

      BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

      BIND(HAS_ZERO);
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

      BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
  // Read a vector of 8- or 16-bit data depending on the string type. Note
  // that inactive elements indicated by the predicate register won't cause
  // a data read from memory to the destination vector.
  if (isL) {
    sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
  } else {
    sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
  }
  add(idx, idx, vec_len);

  // Perform the comparison. An element of the destination predicate is set
  // to active if the particular char is matched.
  sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

  // Branch if the particular char is found.
  br(NE, MATCH);

  sve_whilelt(tmp_pg, T, idx, cnt1);

  // Loop back if the particular char not found.
  br(MI, LOOP);

  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);

  BIND(MATCH);
  // Undo the index increment.
  sub(idx, idx, vec_len);

  // Crop the vector to find its location.
  sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
  add(result, idx, -1);
  sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
  ldrb(ch1, Address(str1, cnt1_neg));
  cmp(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 1);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
                                       FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                       PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected: dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01100101 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates mask value with the minimum unit in byte, we should
  // transform the value in the first lane which is mask in bit now to the
  // mask in byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing. As only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1654 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1655 sve_dup(vtmp2, B, 1);
1656
1657 // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1658 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1659 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1660 // ---------------------------------------
1661 // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1662 sve_bdep(vtmp1, D, vtmp1, vtmp2);
1663
1664 if (bt != T_BYTE) {
1665 sve_vector_extend(vtmp1, size, vtmp1, B);
1666 }
1667 // Generate the mask according to the given vector, in which the elements have been
1668 // extended to the expected type.
1669 // dst = 0b01100101 10001101
1670 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1671 }
1672
1673 // Clobbers: rflags
1674 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1675 FloatRegister zn, FloatRegister zm, Condition cond) {
1676 assert(pg->is_governing(), "This register has to be a governing predicate register");
1677 FloatRegister z1 = zn, z2 = zm;
1678 switch (cond) {
1679 case LE: z1 = zm; z2 = zn; cond = GE; break;
1680 case LT: z1 = zm; z2 = zn; cond = GT; break;
1681 case LO: z1 = zm; z2 = zn; cond = HI; break;
1682 case LS: z1 = zm; z2 = zn; cond = HS; break;
1683 default:
1684 break;
1685 }
1686
1687 SIMD_RegVariant size = elemType_to_regVariant(bt);
1688 if (is_floating_point_type(bt)) {
1689 sve_fcm(cond, pd, size, pg, z1, z2);
1690 } else {
1691 assert(is_integral_type(bt), "unsupported element type");
1692 sve_cmp(cond, pd, size, pg, z1, z2);
1693 }
1694 }
1695
1696 // Get the index of the last mask lane that is set
1697 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1698 SIMD_RegVariant size = elemType_to_regVariant(bt);
1699 sve_rev(ptmp, size, src);
1700 sve_brkb(ptmp, ptrue, ptmp, false);
1701 sve_cntp(dst, size, ptrue, ptmp);
1702 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1703 subw(dst, rscratch1, dst);
1704 }
1705
1706 // Extend integer vector src to dst with the same lane count
1707 // but larger element size, e.g. 4B -> 4I
1708 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1709 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1710 if (src_bt == T_BYTE) {
1711 if (dst_bt == T_SHORT) {
1712 // 4B/8B to 4S/8S
1713 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1714 } else {
1715 // 4B to 4I
1716 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1717 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1718 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1719 }
1720 } else if (src_bt == T_SHORT) {
1721 // 4S to 4I
1722 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1723 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1724 } else if (src_bt == T_INT) {
1725 // 2I to 2L
1726 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1727 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1728 } else {
1729 ShouldNotReachHere();
1730 }
1731 }
1732
1733 // Narrow integer vector src down to dst with the same lane count
1734 // but smaller element size, e.g.
4I -> 4B 1735 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1736 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1737 if (src_bt == T_SHORT) { 1738 // 4S/8S to 4B/8B 1739 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1740 assert(dst_bt == T_BYTE, "unsupported"); 1741 xtn(dst, T8B, src, T8H); 1742 } else if (src_bt == T_INT) { 1743 // 4I to 4B/4S 1744 assert(src_vlen_in_bytes == 16, "unsupported"); 1745 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1746 xtn(dst, T4H, src, T4S); 1747 if (dst_bt == T_BYTE) { 1748 xtn(dst, T8B, dst, T8H); 1749 } 1750 } else if (src_bt == T_LONG) { 1751 // 2L to 2I 1752 assert(src_vlen_in_bytes == 16, "unsupported"); 1753 assert(dst_bt == T_INT, "unsupported"); 1754 xtn(dst, T2S, src, T2D); 1755 } else { 1756 ShouldNotReachHere(); 1757 } 1758 } 1759 1760 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1761 FloatRegister src, SIMD_RegVariant src_size, 1762 bool is_unsigned) { 1763 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1764 1765 if (src_size == B) { 1766 switch (dst_size) { 1767 case H: 1768 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1769 break; 1770 case S: 1771 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1772 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1773 break; 1774 case D: 1775 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1776 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1777 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1778 break; 1779 default: 1780 ShouldNotReachHere(); 1781 } 1782 } else if (src_size == H) { 1783 if (dst_size == S) { 1784 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1785 } else { // D 1786 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1787 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1788 } 1789 } else if (src_size == S) { 1790 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1791 } 1792 } 1793 1794 // Vector narrow from src to dst with specified element sizes. 1795 // High part of dst vector will be filled with zero. 1796 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1797 FloatRegister src, SIMD_RegVariant src_size, 1798 FloatRegister tmp) { 1799 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1800 assert_different_registers(src, tmp); 1801 sve_dup(tmp, src_size, 0); 1802 if (src_size == D) { 1803 switch (dst_size) { 1804 case S: 1805 sve_uzp1(dst, S, src, tmp); 1806 break; 1807 case H: 1808 assert_different_registers(dst, tmp); 1809 sve_uzp1(dst, S, src, tmp); 1810 sve_uzp1(dst, H, dst, tmp); 1811 break; 1812 case B: 1813 assert_different_registers(dst, tmp); 1814 sve_uzp1(dst, S, src, tmp); 1815 sve_uzp1(dst, H, dst, tmp); 1816 sve_uzp1(dst, B, dst, tmp); 1817 break; 1818 default: 1819 ShouldNotReachHere(); 1820 } 1821 } else if (src_size == S) { 1822 if (dst_size == H) { 1823 sve_uzp1(dst, H, src, tmp); 1824 } else { // B 1825 assert_different_registers(dst, tmp); 1826 sve_uzp1(dst, H, src, tmp); 1827 sve_uzp1(dst, B, dst, tmp); 1828 } 1829 } else if (src_size == H) { 1830 sve_uzp1(dst, B, src, tmp); 1831 } 1832 } 1833 1834 // Extend src predicate to dst predicate with the same lane count but larger 1835 // element size, e.g. 
64Byte -> 512Long 1836 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1837 uint dst_element_length_in_bytes, 1838 uint src_element_length_in_bytes) { 1839 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1840 sve_punpklo(dst, src); 1841 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1842 sve_punpklo(dst, src); 1843 sve_punpklo(dst, dst); 1844 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1845 sve_punpklo(dst, src); 1846 sve_punpklo(dst, dst); 1847 sve_punpklo(dst, dst); 1848 } else { 1849 assert(false, "unsupported"); 1850 ShouldNotReachHere(); 1851 } 1852 } 1853 1854 // Narrow src predicate to dst predicate with the same lane count but 1855 // smaller element size, e.g. 512Long -> 64Byte 1856 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1857 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1858 // The insignificant bits in src predicate are expected to be zero. 1859 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1860 // passed as the second argument. An example narrowing operation with a given mask would be - 1861 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1862 // Mask (for 2 Longs) : TF 1863 // Predicate register for the above mask (16 bits) : 00000001 00000000 1864 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1865 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1866 assert_different_registers(src, ptmp); 1867 assert_different_registers(dst, ptmp); 1868 sve_pfalse(ptmp); 1869 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1870 sve_uzp1(dst, B, src, ptmp); 1871 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1872 sve_uzp1(dst, H, src, ptmp); 1873 sve_uzp1(dst, B, dst, ptmp); 1874 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1875 sve_uzp1(dst, S, src, ptmp); 1876 sve_uzp1(dst, H, dst, ptmp); 1877 sve_uzp1(dst, B, dst, ptmp); 1878 } else { 1879 assert(false, "unsupported"); 1880 ShouldNotReachHere(); 1881 } 1882 } 1883 1884 // Vector reduction add for integral type with ASIMD instructions. 1885 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1886 Register isrc, FloatRegister vsrc, 1887 unsigned vector_length_in_bytes, 1888 FloatRegister vtmp) { 1889 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1890 assert_different_registers(dst, isrc); 1891 bool isQ = vector_length_in_bytes == 16; 1892 1893 BLOCK_COMMENT("neon_reduce_add_integral {"); 1894 switch(bt) { 1895 case T_BYTE: 1896 addv(vtmp, isQ ? T16B : T8B, vsrc); 1897 smov(dst, vtmp, B, 0); 1898 addw(dst, dst, isrc, ext::sxtb); 1899 break; 1900 case T_SHORT: 1901 addv(vtmp, isQ ? T8H : T4H, vsrc); 1902 smov(dst, vtmp, H, 0); 1903 addw(dst, dst, isrc, ext::sxth); 1904 break; 1905 case T_INT: 1906 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1907 umov(dst, vtmp, S, 0); 1908 addw(dst, dst, isrc); 1909 break; 1910 case T_LONG: 1911 assert(isQ, "unsupported"); 1912 addpd(vtmp, vsrc); 1913 umov(dst, vtmp, D, 0); 1914 add(dst, dst, isrc); 1915 break; 1916 default: 1917 assert(false, "unsupported"); 1918 ShouldNotReachHere(); 1919 } 1920 BLOCK_COMMENT("} neon_reduce_add_integral"); 1921 } 1922 1923 // Vector reduction multiply for integral type with ASIMD instructions. 
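// The vector is folded in half repeatedly, multiplying the upper half into the
// lower half, until two lanes remain; those two lanes are then multiplied into
// the scalar input isrc. For example, for T_INT with a 128-bit vector
// {1, 2, 3, 4} and isrc = 5, the fold gives {1*3, 2*4} = {3, 8} and the final
// result is dst = 3 * 5 * 8 = 120.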
1924 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1925 // Clobbers: rscratch1 1926 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1927 Register isrc, FloatRegister vsrc, 1928 unsigned vector_length_in_bytes, 1929 FloatRegister vtmp1, FloatRegister vtmp2) { 1930 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1931 bool isQ = vector_length_in_bytes == 16; 1932 1933 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1934 switch(bt) { 1935 case T_BYTE: 1936 if (isQ) { 1937 // Multiply the lower half and higher half of vector iteratively. 1938 // vtmp1 = vsrc[8:15] 1939 ins(vtmp1, D, vsrc, 0, 1); 1940 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1941 mulv(vtmp1, T8B, vtmp1, vsrc); 1942 // vtmp2 = vtmp1[4:7] 1943 ins(vtmp2, S, vtmp1, 0, 1); 1944 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1945 mulv(vtmp1, T8B, vtmp2, vtmp1); 1946 } else { 1947 ins(vtmp1, S, vsrc, 0, 1); 1948 mulv(vtmp1, T8B, vtmp1, vsrc); 1949 } 1950 // vtmp2 = vtmp1[2:3] 1951 ins(vtmp2, H, vtmp1, 0, 1); 1952 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1953 mulv(vtmp2, T8B, vtmp2, vtmp1); 1954 // dst = vtmp2[0] * isrc * vtmp2[1] 1955 umov(rscratch1, vtmp2, B, 0); 1956 mulw(dst, rscratch1, isrc); 1957 sxtb(dst, dst); 1958 umov(rscratch1, vtmp2, B, 1); 1959 mulw(dst, rscratch1, dst); 1960 sxtb(dst, dst); 1961 break; 1962 case T_SHORT: 1963 if (isQ) { 1964 ins(vtmp2, D, vsrc, 0, 1); 1965 mulv(vtmp2, T4H, vtmp2, vsrc); 1966 ins(vtmp1, S, vtmp2, 0, 1); 1967 mulv(vtmp1, T4H, vtmp1, vtmp2); 1968 } else { 1969 ins(vtmp1, S, vsrc, 0, 1); 1970 mulv(vtmp1, T4H, vtmp1, vsrc); 1971 } 1972 umov(rscratch1, vtmp1, H, 0); 1973 mulw(dst, rscratch1, isrc); 1974 sxth(dst, dst); 1975 umov(rscratch1, vtmp1, H, 1); 1976 mulw(dst, rscratch1, dst); 1977 sxth(dst, dst); 1978 break; 1979 case T_INT: 1980 if (isQ) { 1981 ins(vtmp1, D, vsrc, 0, 1); 1982 mulv(vtmp1, T2S, vtmp1, vsrc); 1983 } else { 1984 vtmp1 = vsrc; 1985 } 1986 umov(rscratch1, vtmp1, S, 0); 1987 mul(dst, rscratch1, isrc); 1988 umov(rscratch1, vtmp1, S, 1); 1989 mul(dst, rscratch1, dst); 1990 break; 1991 case T_LONG: 1992 umov(rscratch1, vsrc, D, 0); 1993 mul(dst, isrc, rscratch1); 1994 umov(rscratch1, vsrc, D, 1); 1995 mul(dst, dst, rscratch1); 1996 break; 1997 default: 1998 assert(false, "unsupported"); 1999 ShouldNotReachHere(); 2000 } 2001 BLOCK_COMMENT("} neon_reduce_mul_integral"); 2002 } 2003 2004 // Vector reduction multiply for floating-point type with ASIMD instructions. 
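// The multiplications are applied strictly in lane order, i.e. for a 128-bit
// float vector dst = (((fsrc * vsrc[0]) * vsrc[1]) * vsrc[2]) * vsrc[3], so the
// result matches a sequential left-to-right reduction; this matters because
// floating-point multiplication is not associative.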
2005 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 2006 FloatRegister fsrc, FloatRegister vsrc, 2007 unsigned vector_length_in_bytes, 2008 FloatRegister vtmp) { 2009 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2010 bool isQ = vector_length_in_bytes == 16; 2011 2012 BLOCK_COMMENT("neon_reduce_mul_fp {"); 2013 switch(bt) { 2014 case T_FLOAT: 2015 fmuls(dst, fsrc, vsrc); 2016 ins(vtmp, S, vsrc, 0, 1); 2017 fmuls(dst, dst, vtmp); 2018 if (isQ) { 2019 ins(vtmp, S, vsrc, 0, 2); 2020 fmuls(dst, dst, vtmp); 2021 ins(vtmp, S, vsrc, 0, 3); 2022 fmuls(dst, dst, vtmp); 2023 } 2024 break; 2025 case T_DOUBLE: 2026 assert(isQ, "unsupported"); 2027 fmuld(dst, fsrc, vsrc); 2028 ins(vtmp, D, vsrc, 0, 1); 2029 fmuld(dst, dst, vtmp); 2030 break; 2031 default: 2032 assert(false, "unsupported"); 2033 ShouldNotReachHere(); 2034 } 2035 BLOCK_COMMENT("} neon_reduce_mul_fp"); 2036 } 2037 2038 // Helper to select logical instruction 2039 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 2040 Register Rn, Register Rm, 2041 enum shift_kind kind, unsigned shift) { 2042 switch(opc) { 2043 case Op_AndReductionV: 2044 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 2045 break; 2046 case Op_OrReductionV: 2047 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 2048 break; 2049 case Op_XorReductionV: 2050 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 2051 break; 2052 default: 2053 assert(false, "unsupported"); 2054 ShouldNotReachHere(); 2055 } 2056 } 2057 2058 // Vector reduction logical operations And, Or, Xor 2059 // Clobbers: rscratch1 2060 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2061 Register isrc, FloatRegister vsrc, 2062 unsigned vector_length_in_bytes) { 2063 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2064 "unsupported"); 2065 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2066 assert_different_registers(dst, isrc); 2067 bool isQ = vector_length_in_bytes == 16; 2068 2069 BLOCK_COMMENT("neon_reduce_logical {"); 2070 umov(rscratch1, vsrc, isQ ? D : S, 0); 2071 umov(dst, vsrc, isQ ? 
D : S, 1); 2072 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2073 switch(bt) { 2074 case T_BYTE: 2075 if (isQ) { 2076 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2077 } 2078 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2079 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2080 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2081 sxtb(dst, dst); 2082 break; 2083 case T_SHORT: 2084 if (isQ) { 2085 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2086 } 2087 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2088 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2089 sxth(dst, dst); 2090 break; 2091 case T_INT: 2092 if (isQ) { 2093 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2094 } 2095 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2096 break; 2097 case T_LONG: 2098 assert(isQ, "unsupported"); 2099 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2100 break; 2101 default: 2102 assert(false, "unsupported"); 2103 ShouldNotReachHere(); 2104 } 2105 BLOCK_COMMENT("} neon_reduce_logical"); 2106 } 2107 2108 // Vector reduction min/max for integral type with ASIMD instructions. 2109 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2110 // Clobbers: rscratch1, rflags 2111 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2112 Register isrc, FloatRegister vsrc, 2113 unsigned vector_length_in_bytes, 2114 FloatRegister vtmp) { 2115 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2116 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2117 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2118 assert_different_registers(dst, isrc); 2119 bool isQ = vector_length_in_bytes == 16; 2120 bool is_min = opc == Op_MinReductionV; 2121 2122 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2123 if (bt == T_LONG) { 2124 assert(vtmp == fnoreg, "should be"); 2125 assert(isQ, "should be"); 2126 umov(rscratch1, vsrc, D, 0); 2127 cmp(isrc, rscratch1); 2128 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2129 umov(rscratch1, vsrc, D, 1); 2130 cmp(dst, rscratch1); 2131 csel(dst, dst, rscratch1, is_min ? LT : GT); 2132 } else { 2133 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2134 if (size == T2S) { 2135 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2136 } else { 2137 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2138 } 2139 if (bt == T_INT) { 2140 umov(dst, vtmp, S, 0); 2141 } else { 2142 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2143 } 2144 cmpw(dst, isrc); 2145 cselw(dst, dst, isrc, is_min ? LT : GT); 2146 } 2147 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2148 } 2149 2150 // Vector reduction for integral type with SVE instruction. 2151 // Supported operations are Add, And, Or, Xor, Max, Min. 2152 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
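// The SVE reduction instruction (uaddv/andv/orv/eorv/smaxv/sminv) reduces all
// active lanes of src2 under the governing predicate pg into element 0 of tmp;
// that element is then moved to a general purpose register and combined with
// the scalar input src1. Sub-word (byte/short) results are sign-extended to int.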
2153 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2154 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2155 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2156 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2157 assert_different_registers(src1, dst); 2158 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2159 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2160 switch (opc) { 2161 case Op_AddReductionVI: { 2162 sve_uaddv(tmp, size, pg, src2); 2163 if (bt == T_BYTE) { 2164 smov(dst, tmp, size, 0); 2165 addw(dst, src1, dst, ext::sxtb); 2166 } else if (bt == T_SHORT) { 2167 smov(dst, tmp, size, 0); 2168 addw(dst, src1, dst, ext::sxth); 2169 } else { 2170 umov(dst, tmp, size, 0); 2171 addw(dst, dst, src1); 2172 } 2173 break; 2174 } 2175 case Op_AddReductionVL: { 2176 sve_uaddv(tmp, size, pg, src2); 2177 umov(dst, tmp, size, 0); 2178 add(dst, dst, src1); 2179 break; 2180 } 2181 case Op_AndReductionV: { 2182 sve_andv(tmp, size, pg, src2); 2183 if (bt == T_INT || bt == T_LONG) { 2184 umov(dst, tmp, size, 0); 2185 } else { 2186 smov(dst, tmp, size, 0); 2187 } 2188 if (bt == T_LONG) { 2189 andr(dst, dst, src1); 2190 } else { 2191 andw(dst, dst, src1); 2192 } 2193 break; 2194 } 2195 case Op_OrReductionV: { 2196 sve_orv(tmp, size, pg, src2); 2197 if (bt == T_INT || bt == T_LONG) { 2198 umov(dst, tmp, size, 0); 2199 } else { 2200 smov(dst, tmp, size, 0); 2201 } 2202 if (bt == T_LONG) { 2203 orr(dst, dst, src1); 2204 } else { 2205 orrw(dst, dst, src1); 2206 } 2207 break; 2208 } 2209 case Op_XorReductionV: { 2210 sve_eorv(tmp, size, pg, src2); 2211 if (bt == T_INT || bt == T_LONG) { 2212 umov(dst, tmp, size, 0); 2213 } else { 2214 smov(dst, tmp, size, 0); 2215 } 2216 if (bt == T_LONG) { 2217 eor(dst, dst, src1); 2218 } else { 2219 eorw(dst, dst, src1); 2220 } 2221 break; 2222 } 2223 case Op_MaxReductionV: { 2224 sve_smaxv(tmp, size, pg, src2); 2225 if (bt == T_INT || bt == T_LONG) { 2226 umov(dst, tmp, size, 0); 2227 } else { 2228 smov(dst, tmp, size, 0); 2229 } 2230 if (bt == T_LONG) { 2231 cmp(dst, src1); 2232 csel(dst, dst, src1, Assembler::GT); 2233 } else { 2234 cmpw(dst, src1); 2235 cselw(dst, dst, src1, Assembler::GT); 2236 } 2237 break; 2238 } 2239 case Op_MinReductionV: { 2240 sve_sminv(tmp, size, pg, src2); 2241 if (bt == T_INT || bt == T_LONG) { 2242 umov(dst, tmp, size, 0); 2243 } else { 2244 smov(dst, tmp, size, 0); 2245 } 2246 if (bt == T_LONG) { 2247 cmp(dst, src1); 2248 csel(dst, dst, src1, Assembler::LT); 2249 } else { 2250 cmpw(dst, src1); 2251 cselw(dst, dst, src1, Assembler::LT); 2252 } 2253 break; 2254 } 2255 default: 2256 assert(false, "unsupported"); 2257 ShouldNotReachHere(); 2258 } 2259 2260 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2261 if (bt == T_BYTE) { 2262 sxtb(dst, dst); 2263 } else if (bt == T_SHORT) { 2264 sxth(dst, dst); 2265 } 2266 } 2267 } 2268 2269 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2270 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2271 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
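// For example, with T_INT on a 512-bit SVE machine (max_vector_length == 16):
// lane_cnt == 16 emits "ptrue" with pattern ALL, lane_cnt == 8 uses VL8,
// lane_cnt == 15 matches the MUL3 pattern (16 - 16 % 3), and other values such
// as lane_cnt == 11 fall back to the "whileltw" encoding.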
2272 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2273 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2274 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2275
2276 // Set all elements to false if the input "lane_cnt" is zero.
2277 if (lane_cnt == 0) {
2278 sve_pfalse(dst);
2279 return;
2280 }
2281
2282 SIMD_RegVariant size = elemType_to_regVariant(bt);
2283 assert(size != Q, "invalid size");
2284
2285 // Set all elements to true if "lane_cnt" equals the max lane count.
2286 if (lane_cnt == max_vector_length) {
2287 sve_ptrue(dst, size, /* ALL */ 0b11111);
2288 return;
2289 }
2290
2291 // Fixed numbers for "ptrue".
2292 switch(lane_cnt) {
2293 case 1: /* VL1 */
2294 case 2: /* VL2 */
2295 case 3: /* VL3 */
2296 case 4: /* VL4 */
2297 case 5: /* VL5 */
2298 case 6: /* VL6 */
2299 case 7: /* VL7 */
2300 case 8: /* VL8 */
2301 sve_ptrue(dst, size, lane_cnt);
2302 return;
2303 case 16:
2304 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2305 return;
2306 case 32:
2307 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2308 return;
2309 case 64:
2310 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2311 return;
2312 case 128:
2313 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2314 return;
2315 case 256:
2316 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2317 return;
2318 default:
2319 break;
2320 }
2321
2322 // Special patterns for "ptrue".
2323 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2324 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2325 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2326 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2327 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2328 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2329 } else {
2330 // Encode to "whileltw" for the remaining cases.
2331 mov(rscratch1, lane_cnt);
2332 sve_whileltw(dst, size, zr, rscratch1);
2333 }
2334 }
2335
2336 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2337 // Any remaining elements of dst will be filled with zero.
2338 // Clobbers: rscratch1
2339 // Preserves: src, mask
2340 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2341 FloatRegister vtmp1, FloatRegister vtmp2,
2342 PRegister pgtmp) {
2343 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2344 assert_different_registers(dst, src, vtmp1, vtmp2);
2345 assert_different_registers(mask, pgtmp);
2346
2347 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
2348 // mask = 0001 0000 0000 0001 0001 0000 0001 0001
2349 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
2350 sve_dup(vtmp2, H, 0);
2351
2352 // Extend the lowest half to type INT.
2353 // dst = 00004444 00003333 00002222 00001111
2354 sve_uunpklo(dst, S, src);
2355 // pgtmp = 00000001 00000000 00000001 00000001
2356 sve_punpklo(pgtmp, mask);
2357 // Pack the active elements in size of type INT to the right,
2358 // and fill the remaining elements with zero.
2359 // dst = 00000000 00004444 00002222 00001111
2360 sve_compact(dst, S, dst, pgtmp);
2361 // Narrow the result back to type SHORT.
2362 // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2363 sve_uzp1(dst, H, dst, vtmp2);
2364 // Count the active elements of the lowest half.
2365 // rscratch1 = 3
2366 sve_cntp(rscratch1, S, ptrue, pgtmp);
2367
2368 // Repeat for the highest half.
2369 // pgtmp = 00000001 00000000 00000000 00000001
2370 sve_punpkhi(pgtmp, mask);
2371 // vtmp1 = 00008888 00007777 00006666 00005555
2372 sve_uunpkhi(vtmp1, S, src);
2373 // vtmp1 = 00000000 00000000 00008888 00005555
2374 sve_compact(vtmp1, S, vtmp1, pgtmp);
2375 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2376 sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2377
2378 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
2379 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2380 // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2381 // TRUE_CNT is the number of active elements in the compressed low part.
2382 neg(rscratch1, rscratch1);
2383 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2384 sve_index(vtmp2, H, rscratch1, 1);
2385 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2386 sve_tbl(vtmp1, H, vtmp1, vtmp2);
2387
2388 // Combine the compressed high part (after the shift) with the compressed low part.
2389 // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2390 sve_orr(dst, dst, vtmp1);
2391 }
2392
2393 // Clobbers: rscratch1, rscratch2
2394 // Preserves: src, mask
2395 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2396 FloatRegister vtmp1, FloatRegister vtmp2,
2397 FloatRegister vtmp3, FloatRegister vtmp4,
2398 PRegister ptmp, PRegister pgtmp) {
2399 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2400 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2401 assert_different_registers(mask, ptmp, pgtmp);
2402 // Example input: src = 88 77 66 55 44 33 22 11
2403 // mask = 01 00 00 01 01 00 01 01
2404 // Expected result: dst = 00 00 00 88 55 44 22 11
2405
2406 sve_dup(vtmp4, B, 0);
2407 // Extend the lowest half to type SHORT.
2408 // vtmp1 = 0044 0033 0022 0011
2409 sve_uunpklo(vtmp1, H, src);
2410 // ptmp = 0001 0000 0001 0001
2411 sve_punpklo(ptmp, mask);
2412 // Count the active elements of the lowest half.
2413 // rscratch2 = 3
2414 sve_cntp(rscratch2, H, ptrue, ptmp);
2415 // Pack the active elements in size of type SHORT to the right,
2416 // and fill the remaining elements with zero.
2417 // dst = 0000 0044 0022 0011
2418 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2419 // Narrow the result back to type BYTE.
2420 // dst = 00 00 00 00 00 44 22 11
2421 sve_uzp1(dst, B, dst, vtmp4);
2422
2423 // Repeat for the highest half.
2424 // ptmp = 0001 0000 0000 0001
2425 sve_punpkhi(ptmp, mask);
2426 // vtmp2 = 0088 0077 0066 0055
2427 sve_uunpkhi(vtmp2, H, src);
2428 // vtmp1 = 0000 0000 0088 0055
2429 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2430
2431 sve_dup(vtmp4, B, 0);
2432 // vtmp1 = 00 00 00 00 00 00 88 55
2433 sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2434
2435 // Compressed low: dst = 00 00 00 00 00 44 22 11
2436 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
2437 // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2438 // TRUE_CNT is the number of active elements in the compressed low part.
2439 neg(rscratch2, rscratch2);
2440 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2441 sve_index(vtmp2, B, rscratch2, 1);
2442 // vtmp1 = 00 00 00 88 55 00 00 00
2443 sve_tbl(vtmp1, B, vtmp1, vtmp2);
2444 // Combine the compressed high part (after the shift) with the compressed low part.
2445 // dst = 00 00 00 88 55 44 22 11 2446 sve_orr(dst, dst, vtmp1); 2447 } 2448 2449 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2450 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2451 SIMD_Arrangement size = isQ ? T16B : T8B; 2452 if (bt == T_BYTE) { 2453 rbit(dst, size, src); 2454 } else { 2455 neon_reverse_bytes(dst, src, bt, isQ); 2456 rbit(dst, size, dst); 2457 } 2458 } 2459 2460 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2461 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2462 SIMD_Arrangement size = isQ ? T16B : T8B; 2463 switch (bt) { 2464 case T_BYTE: 2465 if (dst != src) { 2466 orr(dst, size, src, src); 2467 } 2468 break; 2469 case T_SHORT: 2470 rev16(dst, size, src); 2471 break; 2472 case T_INT: 2473 rev32(dst, size, src); 2474 break; 2475 case T_LONG: 2476 rev64(dst, size, src); 2477 break; 2478 default: 2479 assert(false, "unsupported"); 2480 ShouldNotReachHere(); 2481 } 2482 } 2483 2484 // Extract a scalar element from an sve vector at position 'idx'. 2485 // The input elements in src are expected to be of integral type. 2486 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2487 int idx, FloatRegister vtmp) { 2488 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2489 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2490 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2491 if (bt == T_INT || bt == T_LONG) { 2492 umov(dst, src, size, idx); 2493 } else { 2494 smov(dst, src, size, idx); 2495 } 2496 } else { 2497 sve_orr(vtmp, src, src); 2498 sve_ext(vtmp, vtmp, idx << size); 2499 if (bt == T_INT || bt == T_LONG) { 2500 umov(dst, vtmp, size, 0); 2501 } else { 2502 smov(dst, vtmp, size, 0); 2503 } 2504 } 2505 } 2506 2507 // java.lang.Math::round intrinsics 2508 2509 // Clobbers: rscratch1, rflags 2510 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2511 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2512 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2513 switch (T) { 2514 case T2S: 2515 case T4S: 2516 fmovs(tmp1, T, 0.5f); 2517 mov(rscratch1, jint_cast(0x1.0p23f)); 2518 break; 2519 case T2D: 2520 fmovd(tmp1, T, 0.5); 2521 mov(rscratch1, julong_cast(0x1.0p52)); 2522 break; 2523 default: 2524 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2525 } 2526 fadd(tmp1, T, tmp1, src); 2527 fcvtms(tmp1, T, tmp1); 2528 // tmp1 = floor(src + 0.5, ties to even) 2529 2530 fcvtas(dst, T, src); 2531 // dst = round(src), ties to away 2532 2533 fneg(tmp3, T, src); 2534 dup(tmp2, T, rscratch1); 2535 cm(HS, tmp3, T, tmp3, tmp2); 2536 // tmp3 is now a set of flags 2537 2538 bif(dst, T16B, tmp1, tmp3); 2539 // result in dst 2540 } 2541 2542 // Clobbers: rscratch1, rflags 2543 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2544 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2545 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2546 assert_different_registers(tmp1, tmp2, src, dst); 2547 2548 switch (T) { 2549 case S: 2550 mov(rscratch1, jint_cast(0x1.0p23f)); 2551 break; 2552 case D: 2553 mov(rscratch1, julong_cast(0x1.0p52)); 2554 break; 2555 
default:
2556 assert(T == S || T == D, "invalid register variant");
2557 }
2558
2559 sve_frinta(dst, T, ptrue, src);
2560 // dst = round(src), ties to away
2561
2562 Label none;
2563
2564 sve_fneg(tmp1, T, ptrue, src);
2565 sve_dup(tmp2, T, rscratch1);
2566 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2567 br(EQ, none);
2568 {
2569 sve_cpy(tmp1, T, pgtmp, 0.5);
2570 sve_fadd(tmp1, T, pgtmp, src);
2571 sve_frintm(dst, T, pgtmp, tmp1);
2572 // dst = floor(src + 0.5, ties to even)
2573 }
2574 bind(none);
2575
2576 sve_fcvtzs(dst, T, ptrue, dst, T);
2577 // result in dst
2578 }
2579
2580 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2581 FloatRegister one, SIMD_Arrangement T) {
2582 assert_different_registers(dst, src, zero, one);
2583 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2584
2585 facgt(dst, T, src, zero);
2586 ushr(dst, T, dst, 1); // dst = 0 for +-0.0 and NaN, 0x7FF..F otherwise
2587 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2588 }
2589
2590 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2591 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2592 assert_different_registers(dst, src, zero, one, vtmp);
2593 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2594
2595 sve_orr(vtmp, src, src);
2596 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp = 0 for +-0.0 and NaN, 0x1 otherwise
2597 switch (T) {
2598 case S:
2599 sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
2600 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2601 // on the sign of the float value
2602 break;
2603 case D:
2604 sve_and(vtmp, T, min_jlong);
2605 sve_orr(vtmp, T, jlong_cast(1.0));
2606 break;
2607 default:
2608 assert(false, "unsupported");
2609 ShouldNotReachHere();
2610 }
2611 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2612 // Result in dst
2613 }
2614
2615 bool C2_MacroAssembler::in_scratch_emit_size() {
2616 if (ciEnv::current()->task() != nullptr) {
2617 PhaseOutput* phase_output = Compile::current()->output();
2618 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2619 return true;
2620 }
2621 }
2622 return MacroAssembler::in_scratch_emit_size();
2623 }
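// ---------------------------------------------------------------------------
// Illustrative scalar model (not part of the assembler, included only as
// documentation): the NEON signum sequence above can be read as the per-lane
// computation sketched below. The helper name "signum_lane_model" and the use
// of <cmath>/<cstring> are assumptions of this sketch, not code used by C2.
//
//   #include <cmath>
//   #include <cstdint>
//   #include <cstring>
//
//   static double signum_lane_model(double src) {
//     // facgt + ushr: mask = 0x7fffffffffffffff when |src| > 0.0, else 0
//     // (covers +-0.0 and NaN, since comparisons with NaN are false).
//     uint64_t mask = (std::fabs(src) > 0.0) ? (~UINT64_C(0) >> 1) : 0;
//     uint64_t src_bits, one_bits;
//     double one = 1.0;
//     std::memcpy(&src_bits, &src, sizeof src_bits);
//     std::memcpy(&one_bits, &one, sizeof one_bits);
//     // bsl: take bits from "one" where mask is set and from src elsewhere,
//     // i.e. magnitude 1.0 with src's sign bit, or src itself for +-0.0/NaN.
//     uint64_t dst_bits = (one_bits & mask) | (src_bits & ~mask);
//     double dst;
//     std::memcpy(&dst, &dst_bits, sizeof dst);
//     return dst;
//   }
// ---------------------------------------------------------------------------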