/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // Dummy labels for just measuring the code size
  Label dummy_slow_path;
  Label dummy_continuation;
  Label dummy_guard;
  Label* slow_path = &dummy_slow_path;
  Label* continuation = &dummy_continuation;
  Label* guard = &dummy_guard;
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Use real labels from actual stub when not emitting code for the purpose of measuring its size
    C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
    Compile::current()->output()->add_stub(stub);
    slow_path = &stub->entry();
    continuation = &stub->continuation();
    guard = &stub->guard();
  }
  // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
}

// jdk.internal.util.ArraysSupport.vectorizedHashCode
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT ? 4
                    : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
  movw(tmp2, 0x1f);
  br(tmp1);

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1);
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr, rscratch2);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition is true we are done (cont) and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // Try to CAS owner (no owner => current thread's _monitor_owner_id).
  ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rscratch2);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  if (LockingMode == LM_LEGACY) {
    inc_held_monitor_count(rscratch1);
  }

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry_list is empty.
  ldr(rscratch1, Address(tmp, ObjectMonitor::entry_list_offset()));
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont); // If so we are done.

  // Check if there is a successor.
  ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
  cmp(rscratch1, zr);
  br(Assembler::NE, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

  cmp(zr, rthread); // Set Flag to NE => slow path
  b(cont);

  bind(unlocked);
  cmp(zr, zr); // Set Flag to EQ => fast path

  // Intentional fall-through

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  if (LockingMode == LM_LEGACY) {
    dec_held_monitor_count(rscratch1);
  }

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked); // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = y[i+j];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c< 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
    stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
    subs(tmp5, tmp5, 1);
    br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
    (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
    if (!str1_isL) {
      subs(zr, ch1, ASIZE);
      br(HS, BCSKIP);
    }
    strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
    subs(ch2, ch2, 1);
    br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8); // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
    (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
    if (str1_isL == str2_isL) {
      // re-init tmp3. It's for free because it's executed in parallel with
      // load above. Alternative is to initialize it before loop, but it'll
      // affect performance on in-order systems with 2 or more ld/st pipelines
      lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
    }
    if (!isL) { // UU/UL case
      lsl(ch2, cnt1tmp, 1); // offset in bytes
    }
    cmp(tmp3, skipch);
    br(NE, BMSKIP);
    ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
    mov(ch1, tmp6);
    if (isL) {
      b(BMLOOPSTR1_AFTER_LOAD);
    } else {
      sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
      b(BMLOOPSTR1_CMP);
    }
    BIND(BMLOOPSTR1);
    (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
    (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
    subs(cnt1tmp, cnt1tmp, 1);
    br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
    cmp(ch1, ch2);
    br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
    if (!isL) {
      // if we've met UTF symbol while searching Latin1 pattern, then we can
      // skip cnt1 symbols
      if (str1_isL != str2_isL) {
        mov(result_tmp, cnt1);
      } else {
        mov(result_tmp, 1);
      }
      subs(zr, skipch, ASIZE);
      br(HS, BMADV);
    }
    ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
    sub(cnt1tmp, cnt1, 1);
    add(str2, str2, result_tmp, LSL, str2_chr_shift);
    cmp(str2, str2end);
    br(LE, BMLOOPSTR2);
    add(sp, sp, ASIZE);
    b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
    cmp(ch1, ch2);
    br(NE, BMSKIP);
    BIND(BMMATCH);
    sub(result, str2, tmp5);
    if (!str2_isL) lsr(result, result, 1);
    add(sp, sp, ASIZE);
    b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

      BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

      BIND(HAS_ZERO);
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

      BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
  // Read a vector of 8- or 16-bit data depending on the string type. Note
  // that inactive elements indicated by the predicate register won't cause
  // a data read from memory to the destination vector.
  if (isL) {
    sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
  } else {
    sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
  }
  add(idx, idx, vec_len);

  // Perform the comparison. An element of the destination predicate is set
  // to active if the particular char is matched.
  sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

  // Branch if the particular char is found.
  br(NE, MATCH);

  sve_whilelt(tmp_pg, T, idx, cnt1);

  // Loop back if the particular char not found.
  br(MI, LOOP);

  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);

  BIND(MATCH);
  // Undo the index increment.
  sub(idx, idx, vec_len);

  // Crop the vector to find its location.
  sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
  add(result, idx, -1);
  sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
  ldrb(ch1, Address(str1, cnt1_neg));
  cmp(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 1);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
                                       FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                       PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
  case StrIntrinsicNode::LL:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
    break;
  case StrIntrinsicNode::UU:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
    break;
  case StrIntrinsicNode::LU:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
    break;
  case StrIntrinsicNode::UL:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
    break;
  default:
    ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
  case LT: cond = GT; zn = src2; zm = src1; break;
  case LE: cond = GE; zn = src2; zm = src1; break;
  case LO: cond = HI; zn = src2; zm = src1; break;
  case LS: cond = HS; zn = src2; zm = src1; break;
  case NE: cond = EQ; needs_negation = true; break;
  default:
    break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected: dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    // ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
1674 umov(dst, vtmp1, B, 0); 1675 } else if (lane_cnt <= 16) { 1676 ins(vtmp1, B, vtmp1, 1, 8); 1677 umov(dst, vtmp1, H, 0); 1678 } else { 1679 // As the lane count is 64 at most, the final expected value must be in 1680 // the lowest 64 bits after narrowing vtmp1 from D to B. 1681 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2); 1682 umov(dst, vtmp1, D, 0); 1683 } 1684 } else if (UseSVE > 0) { 1685 // Compress the lowest 8 bytes. 1686 fmovd(dst, vtmp1); 1687 bytemask_compress(dst); 1688 if (lane_cnt <= 8) return; 1689 1690 // Repeat on higher bytes and join the results. 1691 // Compress 8 bytes in each iteration. 1692 for (int idx = 1; idx < (lane_cnt / 8); idx++) { 1693 sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2); 1694 bytemask_compress(rscratch1); 1695 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); 1696 } 1697 } else { 1698 assert(false, "unsupported"); 1699 ShouldNotReachHere(); 1700 } 1701 } 1702 1703 // Unpack the mask, a long value in src, into predicate register dst based on the 1704 // corresponding data type. Note that dst can support at most 64 lanes. 1705 // The example below gives the expected dst predicate register for different types, with 1706 // a valid src (0x658D) on a machine with a 1024-bit vector size. 1707 // BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D 1708 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51 1709 // INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01 1710 // LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01 1711 // 1712 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D, which 1713 // has 24 significant bits, would be an invalid input if the dst predicate register refers to 1714 // a LONG type 1024-bit vector, which has at most 16 lanes. 1715 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt, 1716 FloatRegister vtmp1, FloatRegister vtmp2) { 1717 assert(UseSVE == 2 && VM_Version::supports_svebitperm() && 1718 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported"); 1719 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1720 // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16 1721 // Expected: dst = 0b01100101 10001101 1722 1723 // Put the long value from the general purpose register into the first lane of the vector. 1724 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1725 sve_dup(vtmp1, B, 0); 1726 mov(vtmp1, D, 0, src); 1727 1728 // As sve_cmp generates the mask with a minimum granularity of one byte, we need to 1729 // transform the value in the first lane, which is currently a bit mask, into a 1730 // byte mask, which can be done with SVE2's BDEP instruction. 1731 1732 // The first source input of the BDEP instruction. Place each mask byte in its own 8-byte (doubleword) lane. 1733 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1734 if (lane_cnt <= 8) { 1735 // Nothing to do, as only one byte exists. 1736 } else if (lane_cnt <= 16) { 1737 ins(vtmp1, B, vtmp1, 8, 1); 1738 mov(vtmp1, B, 1, zr); 1739 } else { 1740 sve_vector_extend(vtmp1, D, vtmp1, B); 1741 } 1742 1743 // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
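// (With bit 0 set in every byte of the second operand, BDEP scatters successive bits of the first operand into the lowest bit of each byte.)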
1744 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1745 sve_dup(vtmp2, B, 1); 1746 1747 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1748 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1749 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1750 // --------------------------------------- 1751 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1752 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1753 1754 if (bt != T_BYTE) { 1755 sve_vector_extend(vtmp1, size, vtmp1, B); 1756 } 1757 // Generate the mask according to the given vector, in which the elements have been 1758 // extended to the expected type. 1759 // dst = 0b01100101 10001101 1760 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1761 } 1762 1763 // Clobbers: rflags 1764 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1765 FloatRegister zn, FloatRegister zm, Condition cond) { 1766 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1767 FloatRegister z1 = zn, z2 = zm; 1768 switch (cond) { 1769 case LE: z1 = zm; z2 = zn; cond = GE; break; 1770 case LT: z1 = zm; z2 = zn; cond = GT; break; 1771 case LO: z1 = zm; z2 = zn; cond = HI; break; 1772 case LS: z1 = zm; z2 = zn; cond = HS; break; 1773 default: 1774 break; 1775 } 1776 1777 SIMD_RegVariant size = elemType_to_regVariant(bt); 1778 if (is_floating_point_type(bt)) { 1779 sve_fcm(cond, pd, size, pg, z1, z2); 1780 } else { 1781 assert(is_integral_type(bt), "unsupported element type"); 1782 sve_cmp(cond, pd, size, pg, z1, z2); 1783 } 1784 } 1785 1786 // Get the index of the last mask lane that is set 1787 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1788 SIMD_RegVariant size = elemType_to_regVariant(bt); 1789 sve_rev(ptmp, size, src); 1790 sve_brkb(ptmp, ptrue, ptmp, false); 1791 sve_cntp(dst, size, ptrue, ptmp); 1792 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1793 subw(dst, rscratch1, dst); 1794 } 1795 1796 // Extend integer vector src to dst with the same lane count 1797 // but larger element size, e.g. 4B -> 4I 1798 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1799 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1800 if (src_bt == T_BYTE) { 1801 if (dst_bt == T_SHORT) { 1802 // 4B/8B to 4S/8S 1803 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1804 } else { 1805 // 4B to 4I 1806 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1807 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1808 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1809 } 1810 } else if (src_bt == T_SHORT) { 1811 // 4S to 4I 1812 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1813 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1814 } else if (src_bt == T_INT) { 1815 // 2I to 2L 1816 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1817 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1818 } else { 1819 ShouldNotReachHere(); 1820 } 1821 } 1822 1823 // Narrow integer vector src down to dst with the same lane count 1824 // but smaller element size, e.g.
4I -> 4B 1825 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1826 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1827 if (src_bt == T_SHORT) { 1828 // 4S/8S to 4B/8B 1829 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1830 assert(dst_bt == T_BYTE, "unsupported"); 1831 xtn(dst, T8B, src, T8H); 1832 } else if (src_bt == T_INT) { 1833 // 4I to 4B/4S 1834 assert(src_vlen_in_bytes == 16, "unsupported"); 1835 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1836 xtn(dst, T4H, src, T4S); 1837 if (dst_bt == T_BYTE) { 1838 xtn(dst, T8B, dst, T8H); 1839 } 1840 } else if (src_bt == T_LONG) { 1841 // 2L to 2I 1842 assert(src_vlen_in_bytes == 16, "unsupported"); 1843 assert(dst_bt == T_INT, "unsupported"); 1844 xtn(dst, T2S, src, T2D); 1845 } else { 1846 ShouldNotReachHere(); 1847 } 1848 } 1849 1850 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1851 FloatRegister src, SIMD_RegVariant src_size, 1852 bool is_unsigned) { 1853 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1854 1855 if (src_size == B) { 1856 switch (dst_size) { 1857 case H: 1858 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1859 break; 1860 case S: 1861 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1862 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1863 break; 1864 case D: 1865 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1866 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1867 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1868 break; 1869 default: 1870 ShouldNotReachHere(); 1871 } 1872 } else if (src_size == H) { 1873 if (dst_size == S) { 1874 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1875 } else { // D 1876 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1877 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1878 } 1879 } else if (src_size == S) { 1880 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1881 } 1882 } 1883 1884 // Vector narrow from src to dst with specified element sizes. 1885 // High part of dst vector will be filled with zero. 1886 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1887 FloatRegister src, SIMD_RegVariant src_size, 1888 FloatRegister tmp) { 1889 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1890 assert_different_registers(src, tmp); 1891 sve_dup(tmp, src_size, 0); 1892 if (src_size == D) { 1893 switch (dst_size) { 1894 case S: 1895 sve_uzp1(dst, S, src, tmp); 1896 break; 1897 case H: 1898 assert_different_registers(dst, tmp); 1899 sve_uzp1(dst, S, src, tmp); 1900 sve_uzp1(dst, H, dst, tmp); 1901 break; 1902 case B: 1903 assert_different_registers(dst, tmp); 1904 sve_uzp1(dst, S, src, tmp); 1905 sve_uzp1(dst, H, dst, tmp); 1906 sve_uzp1(dst, B, dst, tmp); 1907 break; 1908 default: 1909 ShouldNotReachHere(); 1910 } 1911 } else if (src_size == S) { 1912 if (dst_size == H) { 1913 sve_uzp1(dst, H, src, tmp); 1914 } else { // B 1915 assert_different_registers(dst, tmp); 1916 sve_uzp1(dst, H, src, tmp); 1917 sve_uzp1(dst, B, dst, tmp); 1918 } 1919 } else if (src_size == H) { 1920 sve_uzp1(dst, B, src, tmp); 1921 } 1922 } 1923 1924 // Extend src predicate to dst predicate with the same lane count but larger 1925 // element size, e.g. 
64Byte -> 512Long 1926 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1927 uint dst_element_length_in_bytes, 1928 uint src_element_length_in_bytes) { 1929 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1930 sve_punpklo(dst, src); 1931 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1932 sve_punpklo(dst, src); 1933 sve_punpklo(dst, dst); 1934 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1935 sve_punpklo(dst, src); 1936 sve_punpklo(dst, dst); 1937 sve_punpklo(dst, dst); 1938 } else { 1939 assert(false, "unsupported"); 1940 ShouldNotReachHere(); 1941 } 1942 } 1943 1944 // Narrow src predicate to dst predicate with the same lane count but 1945 // smaller element size, e.g. 512Long -> 64Byte 1946 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1947 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1948 // The insignificant bits in src predicate are expected to be zero. 1949 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1950 // passed as the second argument. An example narrowing operation with a given mask would be - 1951 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1952 // Mask (for 2 Longs) : TF 1953 // Predicate register for the above mask (16 bits) : 00000001 00000000 1954 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1955 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1956 assert_different_registers(src, ptmp); 1957 assert_different_registers(dst, ptmp); 1958 sve_pfalse(ptmp); 1959 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1960 sve_uzp1(dst, B, src, ptmp); 1961 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1962 sve_uzp1(dst, H, src, ptmp); 1963 sve_uzp1(dst, B, dst, ptmp); 1964 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1965 sve_uzp1(dst, S, src, ptmp); 1966 sve_uzp1(dst, H, dst, ptmp); 1967 sve_uzp1(dst, B, dst, ptmp); 1968 } else { 1969 assert(false, "unsupported"); 1970 ShouldNotReachHere(); 1971 } 1972 } 1973 1974 // Vector reduction add for integral type with ASIMD instructions. 1975 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1976 Register isrc, FloatRegister vsrc, 1977 unsigned vector_length_in_bytes, 1978 FloatRegister vtmp) { 1979 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1980 assert_different_registers(dst, isrc); 1981 bool isQ = vector_length_in_bytes == 16; 1982 1983 BLOCK_COMMENT("neon_reduce_add_integral {"); 1984 switch(bt) { 1985 case T_BYTE: 1986 addv(vtmp, isQ ? T16B : T8B, vsrc); 1987 smov(dst, vtmp, B, 0); 1988 addw(dst, dst, isrc, ext::sxtb); 1989 break; 1990 case T_SHORT: 1991 addv(vtmp, isQ ? T8H : T4H, vsrc); 1992 smov(dst, vtmp, H, 0); 1993 addw(dst, dst, isrc, ext::sxth); 1994 break; 1995 case T_INT: 1996 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1997 umov(dst, vtmp, S, 0); 1998 addw(dst, dst, isrc); 1999 break; 2000 case T_LONG: 2001 assert(isQ, "unsupported"); 2002 addpd(vtmp, vsrc); 2003 umov(dst, vtmp, D, 0); 2004 add(dst, dst, isrc); 2005 break; 2006 default: 2007 assert(false, "unsupported"); 2008 ShouldNotReachHere(); 2009 } 2010 BLOCK_COMMENT("} neon_reduce_add_integral"); 2011 } 2012 2013 // Vector reduction multiply for integral type with ASIMD instructions. 
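// There is no cross-lane multiply-reduce instruction in ASIMD, so the vector is folded in half repeatedly
// with element-wise multiplies until two lanes remain; those two lanes and isrc are then combined with
// scalar multiplies.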
2014 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 2015 // Clobbers: rscratch1 2016 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 2017 Register isrc, FloatRegister vsrc, 2018 unsigned vector_length_in_bytes, 2019 FloatRegister vtmp1, FloatRegister vtmp2) { 2020 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2021 bool isQ = vector_length_in_bytes == 16; 2022 2023 BLOCK_COMMENT("neon_reduce_mul_integral {"); 2024 switch(bt) { 2025 case T_BYTE: 2026 if (isQ) { 2027 // Multiply the lower half and higher half of vector iteratively. 2028 // vtmp1 = vsrc[8:15] 2029 ins(vtmp1, D, vsrc, 0, 1); 2030 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 2031 mulv(vtmp1, T8B, vtmp1, vsrc); 2032 // vtmp2 = vtmp1[4:7] 2033 ins(vtmp2, S, vtmp1, 0, 1); 2034 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 2035 mulv(vtmp1, T8B, vtmp2, vtmp1); 2036 } else { 2037 ins(vtmp1, S, vsrc, 0, 1); 2038 mulv(vtmp1, T8B, vtmp1, vsrc); 2039 } 2040 // vtmp2 = vtmp1[2:3] 2041 ins(vtmp2, H, vtmp1, 0, 1); 2042 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 2043 mulv(vtmp2, T8B, vtmp2, vtmp1); 2044 // dst = vtmp2[0] * isrc * vtmp2[1] 2045 umov(rscratch1, vtmp2, B, 0); 2046 mulw(dst, rscratch1, isrc); 2047 sxtb(dst, dst); 2048 umov(rscratch1, vtmp2, B, 1); 2049 mulw(dst, rscratch1, dst); 2050 sxtb(dst, dst); 2051 break; 2052 case T_SHORT: 2053 if (isQ) { 2054 ins(vtmp2, D, vsrc, 0, 1); 2055 mulv(vtmp2, T4H, vtmp2, vsrc); 2056 ins(vtmp1, S, vtmp2, 0, 1); 2057 mulv(vtmp1, T4H, vtmp1, vtmp2); 2058 } else { 2059 ins(vtmp1, S, vsrc, 0, 1); 2060 mulv(vtmp1, T4H, vtmp1, vsrc); 2061 } 2062 umov(rscratch1, vtmp1, H, 0); 2063 mulw(dst, rscratch1, isrc); 2064 sxth(dst, dst); 2065 umov(rscratch1, vtmp1, H, 1); 2066 mulw(dst, rscratch1, dst); 2067 sxth(dst, dst); 2068 break; 2069 case T_INT: 2070 if (isQ) { 2071 ins(vtmp1, D, vsrc, 0, 1); 2072 mulv(vtmp1, T2S, vtmp1, vsrc); 2073 } else { 2074 vtmp1 = vsrc; 2075 } 2076 umov(rscratch1, vtmp1, S, 0); 2077 mul(dst, rscratch1, isrc); 2078 umov(rscratch1, vtmp1, S, 1); 2079 mul(dst, rscratch1, dst); 2080 break; 2081 case T_LONG: 2082 umov(rscratch1, vsrc, D, 0); 2083 mul(dst, isrc, rscratch1); 2084 umov(rscratch1, vsrc, D, 1); 2085 mul(dst, dst, rscratch1); 2086 break; 2087 default: 2088 assert(false, "unsupported"); 2089 ShouldNotReachHere(); 2090 } 2091 BLOCK_COMMENT("} neon_reduce_mul_integral"); 2092 } 2093 2094 // Vector reduction multiply for floating-point type with ASIMD instructions. 
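// Note: the product is accumulated strictly in lane order (fsrc, then lane 0, 1, ...); floating-point
// multiplication is not associative, so reordering the lanes could change the result.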
2095 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 2096 FloatRegister fsrc, FloatRegister vsrc, 2097 unsigned vector_length_in_bytes, 2098 FloatRegister vtmp) { 2099 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2100 bool isQ = vector_length_in_bytes == 16; 2101 2102 BLOCK_COMMENT("neon_reduce_mul_fp {"); 2103 switch(bt) { 2104 case T_FLOAT: 2105 fmuls(dst, fsrc, vsrc); 2106 ins(vtmp, S, vsrc, 0, 1); 2107 fmuls(dst, dst, vtmp); 2108 if (isQ) { 2109 ins(vtmp, S, vsrc, 0, 2); 2110 fmuls(dst, dst, vtmp); 2111 ins(vtmp, S, vsrc, 0, 3); 2112 fmuls(dst, dst, vtmp); 2113 } 2114 break; 2115 case T_DOUBLE: 2116 assert(isQ, "unsupported"); 2117 fmuld(dst, fsrc, vsrc); 2118 ins(vtmp, D, vsrc, 0, 1); 2119 fmuld(dst, dst, vtmp); 2120 break; 2121 default: 2122 assert(false, "unsupported"); 2123 ShouldNotReachHere(); 2124 } 2125 BLOCK_COMMENT("} neon_reduce_mul_fp"); 2126 } 2127 2128 // Helper to select logical instruction 2129 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 2130 Register Rn, Register Rm, 2131 enum shift_kind kind, unsigned shift) { 2132 switch(opc) { 2133 case Op_AndReductionV: 2134 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 2135 break; 2136 case Op_OrReductionV: 2137 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 2138 break; 2139 case Op_XorReductionV: 2140 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 2141 break; 2142 default: 2143 assert(false, "unsupported"); 2144 ShouldNotReachHere(); 2145 } 2146 } 2147 2148 // Vector reduction logical operations And, Or, Xor 2149 // Clobbers: rscratch1 2150 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2151 Register isrc, FloatRegister vsrc, 2152 unsigned vector_length_in_bytes) { 2153 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2154 "unsupported"); 2155 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2156 assert_different_registers(dst, isrc); 2157 bool isQ = vector_length_in_bytes == 16; 2158 2159 BLOCK_COMMENT("neon_reduce_logical {"); 2160 umov(rscratch1, vsrc, isQ ? D : S, 0); 2161 umov(dst, vsrc, isQ ? 
D : S, 1); 2162 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2163 switch(bt) { 2164 case T_BYTE: 2165 if (isQ) { 2166 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2167 } 2168 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2169 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2170 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2171 sxtb(dst, dst); 2172 break; 2173 case T_SHORT: 2174 if (isQ) { 2175 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2176 } 2177 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2178 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2179 sxth(dst, dst); 2180 break; 2181 case T_INT: 2182 if (isQ) { 2183 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2184 } 2185 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2186 break; 2187 case T_LONG: 2188 assert(isQ, "unsupported"); 2189 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2190 break; 2191 default: 2192 assert(false, "unsupported"); 2193 ShouldNotReachHere(); 2194 } 2195 BLOCK_COMMENT("} neon_reduce_logical"); 2196 } 2197 2198 // Vector reduction min/max for integral type with ASIMD instructions. 2199 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2200 // Clobbers: rscratch1, rflags 2201 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2202 Register isrc, FloatRegister vsrc, 2203 unsigned vector_length_in_bytes, 2204 FloatRegister vtmp) { 2205 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2206 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2207 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2208 assert_different_registers(dst, isrc); 2209 bool isQ = vector_length_in_bytes == 16; 2210 bool is_min = opc == Op_MinReductionV; 2211 2212 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2213 if (bt == T_LONG) { 2214 assert(vtmp == fnoreg, "should be"); 2215 assert(isQ, "should be"); 2216 umov(rscratch1, vsrc, D, 0); 2217 cmp(isrc, rscratch1); 2218 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2219 umov(rscratch1, vsrc, D, 1); 2220 cmp(dst, rscratch1); 2221 csel(dst, dst, rscratch1, is_min ? LT : GT); 2222 } else { 2223 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2224 if (size == T2S) { 2225 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2226 } else { 2227 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2228 } 2229 if (bt == T_INT) { 2230 umov(dst, vtmp, S, 0); 2231 } else { 2232 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2233 } 2234 cmpw(dst, isrc); 2235 cselw(dst, dst, isrc, is_min ? LT : GT); 2236 } 2237 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2238 } 2239 2240 // Vector reduction for integral type with SVE instruction. 2241 // Supported operations are Add, And, Or, Xor, Max, Min. 2242 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
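// The vector part is reduced into lane 0 of tmp with a single predicated SVE reduction
// (uaddv/andv/orv/eorv/smaxv/sminv); the scalar in src1 is then folded in using general-purpose instructions.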
2243 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2244 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2245 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2246 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2247 assert_different_registers(src1, dst); 2248 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2249 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2250 switch (opc) { 2251 case Op_AddReductionVI: { 2252 sve_uaddv(tmp, size, pg, src2); 2253 if (bt == T_BYTE) { 2254 smov(dst, tmp, size, 0); 2255 addw(dst, src1, dst, ext::sxtb); 2256 } else if (bt == T_SHORT) { 2257 smov(dst, tmp, size, 0); 2258 addw(dst, src1, dst, ext::sxth); 2259 } else { 2260 umov(dst, tmp, size, 0); 2261 addw(dst, dst, src1); 2262 } 2263 break; 2264 } 2265 case Op_AddReductionVL: { 2266 sve_uaddv(tmp, size, pg, src2); 2267 umov(dst, tmp, size, 0); 2268 add(dst, dst, src1); 2269 break; 2270 } 2271 case Op_AndReductionV: { 2272 sve_andv(tmp, size, pg, src2); 2273 if (bt == T_INT || bt == T_LONG) { 2274 umov(dst, tmp, size, 0); 2275 } else { 2276 smov(dst, tmp, size, 0); 2277 } 2278 if (bt == T_LONG) { 2279 andr(dst, dst, src1); 2280 } else { 2281 andw(dst, dst, src1); 2282 } 2283 break; 2284 } 2285 case Op_OrReductionV: { 2286 sve_orv(tmp, size, pg, src2); 2287 if (bt == T_INT || bt == T_LONG) { 2288 umov(dst, tmp, size, 0); 2289 } else { 2290 smov(dst, tmp, size, 0); 2291 } 2292 if (bt == T_LONG) { 2293 orr(dst, dst, src1); 2294 } else { 2295 orrw(dst, dst, src1); 2296 } 2297 break; 2298 } 2299 case Op_XorReductionV: { 2300 sve_eorv(tmp, size, pg, src2); 2301 if (bt == T_INT || bt == T_LONG) { 2302 umov(dst, tmp, size, 0); 2303 } else { 2304 smov(dst, tmp, size, 0); 2305 } 2306 if (bt == T_LONG) { 2307 eor(dst, dst, src1); 2308 } else { 2309 eorw(dst, dst, src1); 2310 } 2311 break; 2312 } 2313 case Op_MaxReductionV: { 2314 sve_smaxv(tmp, size, pg, src2); 2315 if (bt == T_INT || bt == T_LONG) { 2316 umov(dst, tmp, size, 0); 2317 } else { 2318 smov(dst, tmp, size, 0); 2319 } 2320 if (bt == T_LONG) { 2321 cmp(dst, src1); 2322 csel(dst, dst, src1, Assembler::GT); 2323 } else { 2324 cmpw(dst, src1); 2325 cselw(dst, dst, src1, Assembler::GT); 2326 } 2327 break; 2328 } 2329 case Op_MinReductionV: { 2330 sve_sminv(tmp, size, pg, src2); 2331 if (bt == T_INT || bt == T_LONG) { 2332 umov(dst, tmp, size, 0); 2333 } else { 2334 smov(dst, tmp, size, 0); 2335 } 2336 if (bt == T_LONG) { 2337 cmp(dst, src1); 2338 csel(dst, dst, src1, Assembler::LT); 2339 } else { 2340 cmpw(dst, src1); 2341 cselw(dst, dst, src1, Assembler::LT); 2342 } 2343 break; 2344 } 2345 default: 2346 assert(false, "unsupported"); 2347 ShouldNotReachHere(); 2348 } 2349 2350 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2351 if (bt == T_BYTE) { 2352 sxtb(dst, dst); 2353 } else if (bt == T_SHORT) { 2354 sxth(dst, dst); 2355 } 2356 } 2357 } 2358 2359 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2360 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2361 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
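// Lane counts with a dedicated "ptrue" pattern (VL1-VL8, VL16, VL32, VL64, VL128, VL256, POW2, MUL4, MUL3)
// are encoded directly; any other count falls back to "whileltw" with the count materialized in rscratch1.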
2362 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2363 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2364 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2365 2366 // Set all elements to false if the input "lane_cnt" is zero. 2367 if (lane_cnt == 0) { 2368 sve_pfalse(dst); 2369 return; 2370 } 2371 2372 SIMD_RegVariant size = elemType_to_regVariant(bt); 2373 assert(size != Q, "invalid size"); 2374 2375 // Set all true if "lane_cnt" equals to the max lane count. 2376 if (lane_cnt == max_vector_length) { 2377 sve_ptrue(dst, size, /* ALL */ 0b11111); 2378 return; 2379 } 2380 2381 // Fixed numbers for "ptrue". 2382 switch(lane_cnt) { 2383 case 1: /* VL1 */ 2384 case 2: /* VL2 */ 2385 case 3: /* VL3 */ 2386 case 4: /* VL4 */ 2387 case 5: /* VL5 */ 2388 case 6: /* VL6 */ 2389 case 7: /* VL7 */ 2390 case 8: /* VL8 */ 2391 sve_ptrue(dst, size, lane_cnt); 2392 return; 2393 case 16: 2394 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2395 return; 2396 case 32: 2397 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2398 return; 2399 case 64: 2400 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2401 return; 2402 case 128: 2403 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2404 return; 2405 case 256: 2406 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2407 return; 2408 default: 2409 break; 2410 } 2411 2412 // Special patterns for "ptrue". 2413 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2414 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2415 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2416 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2417 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2418 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2419 } else { 2420 // Encode to "whileltw" for the remaining cases. 2421 mov(rscratch1, lane_cnt); 2422 sve_whileltw(dst, size, zr, rscratch1); 2423 } 2424 } 2425 2426 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2427 // Any remaining elements of dst will be filled with zero. 2428 // Clobbers: rscratch1 2429 // Preserves: src, mask 2430 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2431 FloatRegister vtmp1, FloatRegister vtmp2, 2432 PRegister pgtmp) { 2433 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2434 assert_different_registers(dst, src, vtmp1, vtmp2); 2435 assert_different_registers(mask, pgtmp); 2436 2437 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 2438 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 2439 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 2440 sve_dup(vtmp2, H, 0); 2441 2442 // Extend lowest half to type INT. 2443 // dst = 00004444 00003333 00002222 00001111 2444 sve_uunpklo(dst, S, src); 2445 // pgtmp = 00000001 00000000 00000001 00000001 2446 sve_punpklo(pgtmp, mask); 2447 // Pack the active elements in size of type INT to the right, 2448 // and fill the remainings with zero. 2449 // dst = 00000000 00004444 00002222 00001111 2450 sve_compact(dst, S, dst, pgtmp); 2451 // Narrow the result back to type SHORT. 2452 // dst = 0000 0000 0000 0000 0000 4444 2222 1111 2453 sve_uzp1(dst, H, dst, vtmp2); 2454 // Count the active elements of lowest half. 2455 // rscratch1 = 3 2456 sve_cntp(rscratch1, S, ptrue, pgtmp); 2457 2458 // Repeat to the highest half. 
2459 // pgtmp = 00000001 00000000 00000000 00000001 2460 sve_punpkhi(pgtmp, mask); 2461 // vtmp1 = 00008888 00007777 00006666 00005555 2462 sve_uunpkhi(vtmp1, S, src); 2463 // vtmp1 = 00000000 00000000 00008888 00005555 2464 sve_compact(vtmp1, S, vtmp1, pgtmp); 2465 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2466 sve_uzp1(vtmp1, H, vtmp1, vtmp2); 2467 2468 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111 2469 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2470 // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where 2471 // TRUE_CNT is the number of active elements in the compressed low. 2472 neg(rscratch1, rscratch1); 2473 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2474 sve_index(vtmp2, H, rscratch1, 1); 2475 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000 2476 sve_tbl(vtmp1, H, vtmp1, vtmp2); 2477 2478 // Combine the compressed high (after shifting) with the compressed low. 2479 // dst = 0000 0000 0000 8888 5555 4444 2222 1111 2480 sve_orr(dst, dst, vtmp1); 2481 } 2482 2483 // Clobbers: rscratch1, rscratch2 2484 // Preserves: src, mask 2485 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, 2486 FloatRegister vtmp1, FloatRegister vtmp2, 2487 FloatRegister vtmp3, FloatRegister vtmp4, 2488 PRegister ptmp, PRegister pgtmp) { 2489 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2490 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4); 2491 assert_different_registers(mask, ptmp, pgtmp); 2492 // Example input: src = 88 77 66 55 44 33 22 11 2493 // mask = 01 00 00 01 01 00 01 01 2494 // Expected result: dst = 00 00 00 88 55 44 22 11 2495 2496 sve_dup(vtmp4, B, 0); 2497 // Extend lowest half to type SHORT. 2498 // vtmp1 = 0044 0033 0022 0011 2499 sve_uunpklo(vtmp1, H, src); 2500 // ptmp = 0001 0000 0001 0001 2501 sve_punpklo(ptmp, mask); 2502 // Count the active elements of lowest half. 2503 // rscratch2 = 3 2504 sve_cntp(rscratch2, H, ptrue, ptmp); 2505 // Pack the active elements in size of type SHORT to the right, 2506 // and fill the remaining elements with zero. 2507 // dst = 0000 0044 0022 0011 2508 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp); 2509 // Narrow the result back to type BYTE. 2510 // dst = 00 00 00 00 00 44 22 11 2511 sve_uzp1(dst, B, dst, vtmp4); 2512 2513 // Repeat to the highest half. 2514 // ptmp = 0001 0000 0000 0001 2515 sve_punpkhi(ptmp, mask); 2516 // vtmp2 = 0088 0077 0066 0055 2517 sve_uunpkhi(vtmp2, H, src); 2518 // vtmp1 = 0000 0000 0088 0055 2519 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp); 2520 2521 sve_dup(vtmp4, B, 0); 2522 // vtmp1 = 00 00 00 00 00 00 88 55 2523 sve_uzp1(vtmp1, B, vtmp1, vtmp4); 2524 2525 // Compressed low: dst = 00 00 00 00 00 44 22 11 2526 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55 2527 // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where 2528 // TRUE_CNT is the number of active elements in the compressed low. 2529 neg(rscratch2, rscratch2); 2530 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2531 sve_index(vtmp2, B, rscratch2, 1); 2532 // vtmp1 = 00 00 00 88 55 00 00 00 2533 sve_tbl(vtmp1, B, vtmp1, vtmp2); 2534 // Combine the compressed high (after shifting) with the compressed low.
2535 // dst = 00 00 00 88 55 44 22 11 2536 sve_orr(dst, dst, vtmp1); 2537 } 2538 2539 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2540 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2541 SIMD_Arrangement size = isQ ? T16B : T8B; 2542 if (bt == T_BYTE) { 2543 rbit(dst, size, src); 2544 } else { 2545 neon_reverse_bytes(dst, src, bt, isQ); 2546 rbit(dst, size, dst); 2547 } 2548 } 2549 2550 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2551 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2552 SIMD_Arrangement size = isQ ? T16B : T8B; 2553 switch (bt) { 2554 case T_BYTE: 2555 if (dst != src) { 2556 orr(dst, size, src, src); 2557 } 2558 break; 2559 case T_SHORT: 2560 rev16(dst, size, src); 2561 break; 2562 case T_INT: 2563 rev32(dst, size, src); 2564 break; 2565 case T_LONG: 2566 rev64(dst, size, src); 2567 break; 2568 default: 2569 assert(false, "unsupported"); 2570 ShouldNotReachHere(); 2571 } 2572 } 2573 2574 // VectorRearrange implementation for short/int/float/long/double types with NEON 2575 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction. 2576 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group. 2577 // For VectorRearrange long/double, we compare the shuffle input with iota indices, 2578 // and use bsl to implement the operation. 2579 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src, 2580 FloatRegister shuffle, FloatRegister tmp, 2581 BasicType bt, bool isQ) { 2582 assert_different_registers(dst, src, shuffle, tmp); 2583 SIMD_Arrangement size1 = isQ ? T16B : T8B; 2584 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); 2585 2586 // Here is an example that rearranges a NEON vector with 4 ints: 2587 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] 2588 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1]. 2589 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector 2590 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get 2591 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. 2592 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100], 2593 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] 2594 // 4. Use Vm as index register, and use V1 as table register. 2595 // Then get V2 as the result by tbl NEON instructions. 2596 switch (bt) { 2597 case T_SHORT: 2598 mov(tmp, size1, 0x02); 2599 mulv(dst, size2, shuffle, tmp); 2600 mov(tmp, size2, 0x0100); 2601 addv(dst, size1, dst, tmp); 2602 tbl(dst, size1, src, 1, dst); 2603 break; 2604 case T_INT: 2605 case T_FLOAT: 2606 mov(tmp, size1, 0x04); 2607 mulv(dst, size2, shuffle, tmp); 2608 mov(tmp, size2, 0x03020100); 2609 addv(dst, size1, dst, tmp); 2610 tbl(dst, size1, src, 1, dst); 2611 break; 2612 case T_LONG: 2613 case T_DOUBLE: 2614 // Load the iota indices for Long type. The indices are ordered by 2615 // type B/S/I/L/F/D, and the offset between two types is 16; Hence 2616 // the offset for L is 48. 2617 lea(rscratch1, 2618 ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48)); 2619 ldrq(tmp, rscratch1); 2620 // Check whether the input "shuffle" is the same with iota indices. 2621 // Return "src" if true, otherwise swap the two elements of "src". 
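// For a two-lane vector there are only two possible rearrangements, the identity and a swap of the two
// lanes, so an element-wise compare against the iota indices followed by a bitwise select is sufficient.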
2622 cm(EQ, dst, size2, shuffle, tmp); 2623 ext(tmp, size1, src, src, 8); 2624 bsl(dst, size1, src, tmp); 2625 break; 2626 default: 2627 assert(false, "unsupported element type"); 2628 ShouldNotReachHere(); 2629 } 2630 } 2631 2632 // Extract a scalar element from an sve vector at position 'idx'. 2633 // The input elements in src are expected to be of integral type. 2634 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2635 int idx, FloatRegister vtmp) { 2636 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2637 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2638 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2639 if (bt == T_INT || bt == T_LONG) { 2640 umov(dst, src, size, idx); 2641 } else { 2642 smov(dst, src, size, idx); 2643 } 2644 } else { 2645 sve_orr(vtmp, src, src); 2646 sve_ext(vtmp, vtmp, idx << size); 2647 if (bt == T_INT || bt == T_LONG) { 2648 umov(dst, vtmp, size, 0); 2649 } else { 2650 smov(dst, vtmp, size, 0); 2651 } 2652 } 2653 } 2654 2655 // java.lang.Math::round intrinsics 2656 2657 // Clobbers: rscratch1, rflags 2658 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2659 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2660 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2661 switch (T) { 2662 case T2S: 2663 case T4S: 2664 fmovs(tmp1, T, 0.5f); 2665 mov(rscratch1, jint_cast(0x1.0p23f)); 2666 break; 2667 case T2D: 2668 fmovd(tmp1, T, 0.5); 2669 mov(rscratch1, julong_cast(0x1.0p52)); 2670 break; 2671 default: 2672 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2673 } 2674 fadd(tmp1, T, tmp1, src); 2675 fcvtms(tmp1, T, tmp1); 2676 // tmp1 = floor(src + 0.5, ties to even) 2677 2678 fcvtas(dst, T, src); 2679 // dst = round(src), ties to away 2680 2681 fneg(tmp3, T, src); 2682 dup(tmp2, T, rscratch1); 2683 cm(HS, tmp3, T, tmp3, tmp2); 2684 // tmp3 is now a set of flags 2685 2686 bif(dst, T16B, tmp1, tmp3); 2687 // result in dst 2688 } 2689 2690 // Clobbers: rscratch1, rflags 2691 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2692 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2693 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2694 assert_different_registers(tmp1, tmp2, src, dst); 2695 2696 switch (T) { 2697 case S: 2698 mov(rscratch1, jint_cast(0x1.0p23f)); 2699 break; 2700 case D: 2701 mov(rscratch1, julong_cast(0x1.0p52)); 2702 break; 2703 default: 2704 assert(T == S || T == D, "invalid register variant"); 2705 } 2706 2707 sve_frinta(dst, T, ptrue, src); 2708 // dst = round(src), ties to away 2709 2710 Label none; 2711 2712 sve_fneg(tmp1, T, ptrue, src); 2713 sve_dup(tmp2, T, rscratch1); 2714 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2715 br(EQ, none); 2716 { 2717 sve_cpy(tmp1, T, pgtmp, 0.5); 2718 sve_fadd(tmp1, T, pgtmp, src); 2719 sve_frintm(dst, T, pgtmp, tmp1); 2720 // dst = floor(src + 0.5, ties to even) 2721 } 2722 bind(none); 2723 2724 sve_fcvtzs(dst, T, ptrue, dst, T); 2725 // result in dst 2726 } 2727 2728 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2729 FloatRegister one, SIMD_Arrangement T) { 2730 assert_different_registers(dst, src, zero, one); 2731 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2732 2733 facgt(dst, T, src, zero); 2734 ushr(dst, T, 
dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise 2735 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst 2736 } 2737 2738 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero, 2739 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) { 2740 assert_different_registers(dst, src, zero, one, vtmp); 2741 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2742 2743 sve_orr(vtmp, src, src); 2744 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise 2745 switch (T) { 2746 case S: 2747 sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src 2748 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending 2749 // on the sign of the float value 2750 break; 2751 case D: 2752 sve_and(vtmp, T, min_jlong); 2753 sve_orr(vtmp, T, jlong_cast(1.0)); 2754 break; 2755 default: 2756 assert(false, "unsupported"); 2757 ShouldNotReachHere(); 2758 } 2759 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp 2760 // Result in dst 2761 } 2762 2763 bool C2_MacroAssembler::in_scratch_emit_size() { 2764 if (ciEnv::current()->task() != nullptr) { 2765 PhaseOutput* phase_output = Compile::current()->output(); 2766 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2767 return true; 2768 } 2769 } 2770 return MacroAssembler::in_scratch_emit_size(); 2771 }