1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "opto/c2_MacroAssembler.hpp" 28 #include "opto/compile.hpp" 29 #include "opto/intrinsicnode.hpp" 30 #include "opto/matcher.hpp" 31 #include "opto/output.hpp" 32 #include "opto/subnode.hpp" 33 #include "runtime/stubRoutines.hpp" 34 #include "utilities/globalDefinitions.hpp" 35 #include "utilities/powerOfTwo.hpp" 36 37 #ifdef PRODUCT 38 #define BLOCK_COMMENT(str) /* nothing */ 39 #define STOP(error) stop(error) 40 #else 41 #define BLOCK_COMMENT(str) block_comment(str) 42 #define STOP(error) block_comment(error); stop(error) 43 #endif 44 45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 46 47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 48 49 void C2_MacroAssembler::entry_barrier() { 50 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 51 // Dummy labels for just measuring the code size 52 Label dummy_slow_path; 53 Label dummy_continuation; 54 Label dummy_guard; 55 Label* slow_path = &dummy_slow_path; 56 Label* continuation = &dummy_continuation; 57 Label* guard = &dummy_guard; 58 if (!Compile::current()->output()->in_scratch_emit_size()) { 59 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 60 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 61 Compile::current()->output()->add_stub(stub); 62 slow_path = &stub->entry(); 63 continuation = &stub->continuation(); 64 guard = &stub->guard(); 65 } 66 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub. 67 bs->nmethod_entry_barrier(this, slow_path, continuation, guard); 68 } 69 70 // jdk.internal.util.ArraysSupport.vectorizedHashCode 71 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 72 FloatRegister vdata0, FloatRegister vdata1, 73 FloatRegister vdata2, FloatRegister vdata3, 74 FloatRegister vmul0, FloatRegister vmul1, 75 FloatRegister vmul2, FloatRegister vmul3, 76 FloatRegister vpow, FloatRegister vpowm, 77 BasicType eltype) { 78 ARRAYS_HASHCODE_REGISTERS; 79 80 Register tmp1 = rscratch1, tmp2 = rscratch2; 81 82 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE; 83 84 // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. 
We 85 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to 86 // use 4H for chars and shorts instead, but using 8H gives better performance. 87 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8 88 : eltype == T_CHAR || eltype == T_SHORT ? 8 89 : eltype == T_INT ? 4 90 : 0; 91 guarantee(vf, "unsupported eltype"); 92 93 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis. 94 const size_t unroll_factor = 4; 95 96 switch (eltype) { 97 case T_BOOLEAN: 98 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); 99 break; 100 case T_CHAR: 101 BLOCK_COMMENT("arrays_hashcode(char) {"); 102 break; 103 case T_BYTE: 104 BLOCK_COMMENT("arrays_hashcode(byte) {"); 105 break; 106 case T_SHORT: 107 BLOCK_COMMENT("arrays_hashcode(short) {"); 108 break; 109 case T_INT: 110 BLOCK_COMMENT("arrays_hashcode(int) {"); 111 break; 112 default: 113 ShouldNotReachHere(); 114 } 115 116 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop 117 // implemented by the stub executes just once. Call the stub only if at least two iterations will 118 // be executed. 119 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf; 120 cmpw(cnt, large_threshold); 121 br(Assembler::HS, LARGE); 122 123 bind(TAIL); 124 125 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past 126 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs. 127 // Iteration eats up the remainder, uf elements at a time. 128 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC"); 129 andr(tmp2, cnt, unroll_factor - 1); 130 adr(tmp1, BR_BASE); 131 // For Cortex-A53 offset is 4 because 2 nops are generated. 132 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3); 133 movw(tmp2, 0x1f); 134 br(tmp1); 135 136 bind(LOOP); 137 for (size_t i = 0; i < unroll_factor; ++i) { 138 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype); 139 maddw(result, result, tmp2, tmp1); 140 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 141 // Generate 2nd nop to have 4 instructions per iteration. 142 if (VM_Version::supports_a53mac()) { 143 nop(); 144 } 145 } 146 bind(BR_BASE); 147 subsw(cnt, cnt, unroll_factor); 148 br(Assembler::HS, LOOP); 149 150 b(DONE); 151 152 bind(LARGE); 153 154 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype)); 155 assert(stub.target() != nullptr, "array_hashcode stub has not been generated"); 156 address tpc = trampoline_call(stub); 157 if (tpc == nullptr) { 158 DEBUG_ONLY(reset_labels(TAIL, BR_BASE)); 159 postcond(pc() == badAddress); 160 return nullptr; 161 } 162 163 bind(DONE); 164 165 BLOCK_COMMENT("} // arrays_hashcode"); 166 167 postcond(pc() != badAddress); 168 return pc(); 169 } 170 171 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 172 Register tmp2Reg, Register tmp3Reg) { 173 Register oop = objectReg; 174 Register box = boxReg; 175 Register disp_hdr = tmpReg; 176 Register tmp = tmp2Reg; 177 Label cont; 178 Label object_has_monitor; 179 Label count, no_count; 180 181 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 182 assert_different_registers(oop, box, tmp, disp_hdr, rscratch2); 183 184 // Load markWord from object into displaced_header. 
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition is true we are done (cont) and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // Try to CAS owner (no owner => current thread's _monitor_owner_id).
  ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rscratch2);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  if (LockingMode == LM_LEGACY) {
    inc_held_monitor_count(rscratch1);
  }

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry_list is empty.
  ldr(rscratch1, Address(tmp, ObjectMonitor::entry_list_offset()));
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont); // If so we are done.

  // Check if there is a successor.
353 ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset())); 354 cmp(rscratch1, zr); 355 br(Assembler::NE, unlocked); // If so we are done. 356 357 // Save the monitor pointer in the current thread, so we can try to 358 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 359 str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 360 361 cmp(zr, rthread); // Set Flag to NE => slow path 362 b(cont); 363 364 bind(unlocked); 365 cmp(zr, zr); // Set Flag to EQ => fast path 366 367 // Intentional fall-through 368 369 bind(cont); 370 // flag == EQ indicates success 371 // flag == NE indicates failure 372 br(Assembler::NE, no_count); 373 374 bind(count); 375 if (LockingMode == LM_LEGACY) { 376 dec_held_monitor_count(rscratch1); 377 } 378 379 bind(no_count); 380 } 381 382 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1, 383 Register t2, Register t3) { 384 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 385 assert_different_registers(obj, box, t1, t2, t3, rscratch2); 386 387 // Handle inflated monitor. 388 Label inflated; 389 // Finish fast lock successfully. MUST branch to with flag == EQ 390 Label locked; 391 // Finish fast lock unsuccessfully. MUST branch to with flag == NE 392 Label slow_path; 393 394 if (UseObjectMonitorTable) { 395 // Clear cache in case fast locking succeeds or we need to take the slow-path. 396 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 397 } 398 399 if (DiagnoseSyncOnValueBasedClasses != 0) { 400 load_klass(t1, obj); 401 ldrb(t1, Address(t1, Klass::misc_flags_offset())); 402 tst(t1, KlassFlags::_misc_is_value_based_class); 403 br(Assembler::NE, slow_path); 404 } 405 406 const Register t1_mark = t1; 407 const Register t3_t = t3; 408 409 { // Lightweight locking 410 411 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ 412 Label push; 413 414 const Register t2_top = t2; 415 416 // Check if lock-stack is full. 417 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 418 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1); 419 br(Assembler::GT, slow_path); 420 421 // Check if recursive. 422 subw(t3_t, t2_top, oopSize); 423 ldr(t3_t, Address(rthread, t3_t)); 424 cmp(obj, t3_t); 425 br(Assembler::EQ, push); 426 427 // Relaxed normal load to check for monitor. Optimization for monitor case. 428 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 429 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 430 431 // Not inflated 432 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 433 434 // Try to lock. Transition lock-bits 0b01 => 0b00 435 orr(t1_mark, t1_mark, markWord::unlocked_value); 436 eor(t3_t, t1_mark, markWord::unlocked_value); 437 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 438 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 439 br(Assembler::NE, slow_path); 440 441 bind(push); 442 // After successful lock, push object on lock-stack. 443 str(obj, Address(rthread, t2_top)); 444 addw(t2_top, t2_top, oopSize); 445 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 446 b(locked); 447 } 448 449 { // Handle inflated monitor. 
450 bind(inflated); 451 452 const Register t1_monitor = t1; 453 454 if (!UseObjectMonitorTable) { 455 assert(t1_monitor == t1_mark, "should be the same here"); 456 } else { 457 Label monitor_found; 458 459 // Load cache address 460 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset())); 461 462 const int num_unrolled = 2; 463 for (int i = 0; i < num_unrolled; i++) { 464 ldr(t1, Address(t3_t)); 465 cmp(obj, t1); 466 br(Assembler::EQ, monitor_found); 467 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 468 } 469 470 Label loop; 471 472 // Search for obj in cache. 473 bind(loop); 474 475 // Check for match. 476 ldr(t1, Address(t3_t)); 477 cmp(obj, t1); 478 br(Assembler::EQ, monitor_found); 479 480 // Search until null encountered, guaranteed _null_sentinel at end. 481 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 482 cbnz(t1, loop); 483 // Cache Miss, NE set from cmp above, cbnz does not set flags 484 b(slow_path); 485 486 bind(monitor_found); 487 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference())); 488 } 489 490 const Register t2_owner_addr = t2; 491 const Register t3_owner = t3; 492 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 493 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag); 494 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag); 495 496 Label monitor_locked; 497 498 // Compute owner address. 499 lea(t2_owner_addr, owner_address); 500 501 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 502 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset())); 503 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true, 504 /*release*/ false, /*weak*/ false, t3_owner); 505 br(Assembler::EQ, monitor_locked); 506 507 // Check if recursive. 508 cmp(t3_owner, rscratch2); 509 br(Assembler::NE, slow_path); 510 511 // Recursive. 512 increment(recursions_address, 1); 513 514 bind(monitor_locked); 515 if (UseObjectMonitorTable) { 516 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 517 } 518 } 519 520 bind(locked); 521 522 #ifdef ASSERT 523 // Check that locked label is reached with Flags == EQ. 524 Label flag_correct; 525 br(Assembler::EQ, flag_correct); 526 stop("Fast Lock Flag != EQ"); 527 #endif 528 529 bind(slow_path); 530 #ifdef ASSERT 531 // Check that slow_path label is reached with Flags == NE. 532 br(Assembler::NE, flag_correct); 533 stop("Fast Lock Flag != NE"); 534 bind(flag_correct); 535 #endif 536 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 537 } 538 539 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1, 540 Register t2, Register t3) { 541 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 542 assert_different_registers(obj, box, t1, t2, t3); 543 544 // Handle inflated monitor. 545 Label inflated, inflated_load_mark; 546 // Finish fast unlock successfully. MUST branch to with flag == EQ 547 Label unlocked; 548 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 549 Label slow_path; 550 551 const Register t1_mark = t1; 552 const Register t2_top = t2; 553 const Register t3_t = t3; 554 555 { // Lightweight unlock 556 557 Label push_and_slow_path; 558 559 // Check if obj is top of lock-stack. 
560 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 561 subw(t2_top, t2_top, oopSize); 562 ldr(t3_t, Address(rthread, t2_top)); 563 cmp(obj, t3_t); 564 // Top of lock stack was not obj. Must be monitor. 565 br(Assembler::NE, inflated_load_mark); 566 567 // Pop lock-stack. 568 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 569 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 570 571 // Check if recursive. 572 subw(t3_t, t2_top, oopSize); 573 ldr(t3_t, Address(rthread, t3_t)); 574 cmp(obj, t3_t); 575 br(Assembler::EQ, unlocked); 576 577 // Not recursive. 578 // Load Mark. 579 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 580 581 // Check header for monitor (0b10). 582 // Because we got here by popping (meaning we pushed in locked) 583 // there will be no monitor in the box. So we need to push back the obj 584 // so that the runtime can fix any potential anonymous owner. 585 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated); 586 587 // Try to unlock. Transition lock bits 0b00 => 0b01 588 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 589 orr(t3_t, t1_mark, markWord::unlocked_value); 590 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 591 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 592 br(Assembler::EQ, unlocked); 593 594 bind(push_and_slow_path); 595 // Compare and exchange failed. 596 // Restore lock-stack and handle the unlock in runtime. 597 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 598 addw(t2_top, t2_top, oopSize); 599 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 600 b(slow_path); 601 } 602 603 604 { // Handle inflated monitor. 605 bind(inflated_load_mark); 606 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 607 #ifdef ASSERT 608 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 609 stop("Fast Unlock not monitor"); 610 #endif 611 612 bind(inflated); 613 614 #ifdef ASSERT 615 Label check_done; 616 subw(t2_top, t2_top, oopSize); 617 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 618 br(Assembler::LT, check_done); 619 ldr(t3_t, Address(rthread, t2_top)); 620 cmp(obj, t3_t); 621 br(Assembler::NE, inflated); 622 stop("Fast Unlock lock on stack"); 623 bind(check_done); 624 #endif 625 626 const Register t1_monitor = t1; 627 628 if (!UseObjectMonitorTable) { 629 assert(t1_monitor == t1_mark, "should be the same here"); 630 631 // Untag the monitor. 632 add(t1_monitor, t1_mark, -(int)markWord::monitor_value); 633 } else { 634 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 635 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 636 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 637 br(Assembler::LO, slow_path); 638 } 639 640 const Register t2_recursions = t2; 641 Label not_recursive; 642 643 // Check if recursive. 644 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 645 cbz(t2_recursions, not_recursive); 646 647 // Recursive unlock. 648 sub(t2_recursions, t2_recursions, 1u); 649 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 650 // Set flag == EQ 651 cmp(t2_recursions, t2_recursions); 652 b(unlocked); 653 654 bind(not_recursive); 655 656 const Register t2_owner_addr = t2; 657 658 // Compute owner address. 659 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 660 661 // Set owner to null. 
662 // Release to satisfy the JMM 663 stlr(zr, t2_owner_addr); 664 // We need a full fence after clearing owner to avoid stranding. 665 // StoreLoad achieves this. 666 membar(StoreLoad); 667 668 // Check if the entry_list is empty. 669 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset())); 670 cmp(rscratch1, zr); 671 br(Assembler::EQ, unlocked); // If so we are done. 672 673 // Check if there is a successor. 674 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset())); 675 cmp(rscratch1, zr); 676 br(Assembler::NE, unlocked); // If so we are done. 677 678 // Save the monitor pointer in the current thread, so we can try to 679 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 680 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 681 682 cmp(zr, rthread); // Set Flag to NE => slow path 683 b(slow_path); 684 } 685 686 bind(unlocked); 687 cmp(zr, zr); // Set Flags to EQ => fast path 688 689 #ifdef ASSERT 690 // Check that unlocked label is reached with Flags == EQ. 691 Label flag_correct; 692 br(Assembler::EQ, flag_correct); 693 stop("Fast Unlock Flag != EQ"); 694 #endif 695 696 bind(slow_path); 697 #ifdef ASSERT 698 // Check that slow_path label is reached with Flags == NE. 699 br(Assembler::NE, flag_correct); 700 stop("Fast Unlock Flag != NE"); 701 bind(flag_correct); 702 #endif 703 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 704 } 705 706 // Search for str1 in str2 and return index or -1 707 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 708 void C2_MacroAssembler::string_indexof(Register str2, Register str1, 709 Register cnt2, Register cnt1, 710 Register tmp1, Register tmp2, 711 Register tmp3, Register tmp4, 712 Register tmp5, Register tmp6, 713 int icnt1, Register result, int ae) { 714 // NOTE: tmp5, tmp6 can be zr depending on specific method version 715 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 716 717 Register ch1 = rscratch1; 718 Register ch2 = rscratch2; 719 Register cnt1tmp = tmp1; 720 Register cnt2tmp = tmp2; 721 Register cnt1_neg = cnt1; 722 Register cnt2_neg = cnt2; 723 Register result_tmp = tmp4; 724 725 bool isL = ae == StrIntrinsicNode::LL; 726 727 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 728 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 729 int str1_chr_shift = str1_isL ? 0:1; 730 int str2_chr_shift = str2_isL ? 0:1; 731 int str1_chr_size = str1_isL ? 1:2; 732 int str2_chr_size = str2_isL ? 1:2; 733 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 734 (chr_insn)&MacroAssembler::ldrh; 735 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 736 (chr_insn)&MacroAssembler::ldrh; 737 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 738 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 739 740 // Note, inline_string_indexOf() generates checks: 741 // if (substr.count > string.count) return -1; 742 // if (substr.count == 0) return 0; 743 744 // We have two strings, a source string in str2, cnt2 and a pattern string 745 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1. 746 747 // For larger pattern and source we use a simplified Boyer Moore algorithm. 748 // With a small pattern and source we use linear scan. 

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few Java-specific optimizations.
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //       int i, j;
  //       unsigned c;
  //       unsigned char bc[ASIZE];
  //
  //       /* Preprocessing */
  //       for (i = 0; i < ASIZE; ++i)
  //          bc[i] = m;
  //       for (i = 0; i < m - 1; ) {
  //          c = x[i];
  //          ++i;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef PATTERN_STRING_IS_LATIN1
  //          bc[c] = m - i;
  //          #else
  //          if (c < ASIZE) bc[c] = m - i;
  //          #endif
  //       }
  //
  //       /* Searching */
  //       j = 0;
  //       while (j <= n - m) {
  //          c = y[j+m-1];
  //          if (x[m-1] == c)
  //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //          if (i < 0) return j;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef SOURCE_STRING_IS_LATIN1
  //          // LL case: (c < 256) always true. Remove branch
  //          j += bc[y[j+m-1]];
  //          #endif
  //          #ifndef PATTERN_STRING_IS_UTF
  //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += 1
  //          #endif
  //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += m
  //          #endif
  //       }
  //    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half a register for
    // the UL case. We'll re-read the last character in the inner pre-loop code to have
    // a single outer pre-loop load
    const int firstStep = isL ?
7 : 3; 842 843 const int ASIZE = 256; 844 const int STORED_BYTES = 32; // amount of bytes stored per instruction 845 sub(sp, sp, ASIZE); 846 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 847 mov(ch1, sp); 848 BIND(BM_INIT_LOOP); 849 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 850 subs(tmp5, tmp5, 1); 851 br(GT, BM_INIT_LOOP); 852 853 sub(cnt1tmp, cnt1, 1); 854 mov(tmp5, str2); 855 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 856 sub(ch2, cnt1, 1); 857 mov(tmp3, str1); 858 BIND(BCLOOP); 859 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 860 if (!str1_isL) { 861 subs(zr, ch1, ASIZE); 862 br(HS, BCSKIP); 863 } 864 strb(ch2, Address(sp, ch1)); 865 BIND(BCSKIP); 866 subs(ch2, ch2, 1); 867 br(GT, BCLOOP); 868 869 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 870 if (str1_isL == str2_isL) { 871 // load last 8 bytes (8LL/4UU symbols) 872 ldr(tmp6, Address(tmp6, -wordSize)); 873 } else { 874 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 875 // convert Latin1 to UTF. We'll have to wait until load completed, but 876 // it's still faster than per-character loads+checks 877 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 878 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 879 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 880 andr(tmp6, tmp6, 0xFF); // str1[N-4] 881 orr(ch2, ch1, ch2, LSL, 16); 882 orr(tmp6, tmp6, tmp3, LSL, 48); 883 orr(tmp6, tmp6, ch2, LSL, 16); 884 } 885 BIND(BMLOOPSTR2); 886 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 887 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 888 if (str1_isL == str2_isL) { 889 // re-init tmp3. It's for free because it's executed in parallel with 890 // load above. Alternative is to initialize it before loop, but it'll 891 // affect performance on in-order systems with 2 or more ld/st pipelines 892 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 893 } 894 if (!isL) { // UU/UL case 895 lsl(ch2, cnt1tmp, 1); // offset in bytes 896 } 897 cmp(tmp3, skipch); 898 br(NE, BMSKIP); 899 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 900 mov(ch1, tmp6); 901 if (isL) { 902 b(BMLOOPSTR1_AFTER_LOAD); 903 } else { 904 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 905 b(BMLOOPSTR1_CMP); 906 } 907 BIND(BMLOOPSTR1); 908 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 909 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 910 BIND(BMLOOPSTR1_AFTER_LOAD); 911 subs(cnt1tmp, cnt1tmp, 1); 912 br(LT, BMLOOPSTR1_LASTCMP); 913 BIND(BMLOOPSTR1_CMP); 914 cmp(ch1, ch2); 915 br(EQ, BMLOOPSTR1); 916 BIND(BMSKIP); 917 if (!isL) { 918 // if we've met UTF symbol while searching Latin1 pattern, then we can 919 // skip cnt1 symbols 920 if (str1_isL != str2_isL) { 921 mov(result_tmp, cnt1); 922 } else { 923 mov(result_tmp, 1); 924 } 925 subs(zr, skipch, ASIZE); 926 br(HS, BMADV); 927 } 928 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 929 BIND(BMADV); 930 sub(cnt1tmp, cnt1, 1); 931 add(str2, str2, result_tmp, LSL, str2_chr_shift); 932 cmp(str2, str2end); 933 br(LE, BMLOOPSTR2); 934 add(sp, sp, ASIZE); 935 b(NOMATCH); 936 BIND(BMLOOPSTR1_LASTCMP); 937 cmp(ch1, ch2); 938 br(NE, BMSKIP); 939 BIND(BMMATCH); 940 sub(result, str2, tmp5); 941 if (!str2_isL) lsr(result, result, 1); 942 add(sp, sp, ASIZE); 943 b(DONE); 944 945 BIND(LINEARSTUB); 946 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 947 br(LT, LINEAR_MEDIUM); 948 mov(result, zr); 949 RuntimeAddress stub = nullptr; 950 if (isL) { 951 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 952 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 953 } else if (str1_isL) { 954 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 955 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 956 } else { 957 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 958 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 959 } 960 address call = trampoline_call(stub); 961 if (call == nullptr) { 962 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 963 ciEnv::current()->record_failure("CodeCache is full"); 964 return; 965 } 966 b(DONE); 967 } 968 969 BIND(LINEARSEARCH); 970 { 971 Label DO1, DO2, DO3; 972 973 Register str2tmp = tmp2; 974 Register first = tmp3; 975 976 if (icnt1 == -1) 977 { 978 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 979 980 cmp(cnt1, u1(str1_isL == str2_isL ? 
4 : 2)); 981 br(LT, DOSHORT); 982 BIND(LINEAR_MEDIUM); 983 (this->*str1_load_1chr)(first, Address(str1)); 984 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 985 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 986 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 987 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 988 989 BIND(FIRST_LOOP); 990 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 991 cmp(first, ch2); 992 br(EQ, STR1_LOOP); 993 BIND(STR2_NEXT); 994 adds(cnt2_neg, cnt2_neg, str2_chr_size); 995 br(LE, FIRST_LOOP); 996 b(NOMATCH); 997 998 BIND(STR1_LOOP); 999 adds(cnt1tmp, cnt1_neg, str1_chr_size); 1000 add(cnt2tmp, cnt2_neg, str2_chr_size); 1001 br(GE, MATCH); 1002 1003 BIND(STR1_NEXT); 1004 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 1005 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 1006 cmp(ch1, ch2); 1007 br(NE, STR2_NEXT); 1008 adds(cnt1tmp, cnt1tmp, str1_chr_size); 1009 add(cnt2tmp, cnt2tmp, str2_chr_size); 1010 br(LT, STR1_NEXT); 1011 b(MATCH); 1012 1013 BIND(DOSHORT); 1014 if (str1_isL == str2_isL) { 1015 cmp(cnt1, (u1)2); 1016 br(LT, DO1); 1017 br(GT, DO3); 1018 } 1019 } 1020 1021 if (icnt1 == 4) { 1022 Label CH1_LOOP; 1023 1024 (this->*load_4chr)(ch1, str1); 1025 sub(result_tmp, cnt2, 4); 1026 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1027 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1028 1029 BIND(CH1_LOOP); 1030 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 1031 cmp(ch1, ch2); 1032 br(EQ, MATCH); 1033 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1034 br(LE, CH1_LOOP); 1035 b(NOMATCH); 1036 } 1037 1038 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 1039 Label CH1_LOOP; 1040 1041 BIND(DO2); 1042 (this->*load_2chr)(ch1, str1); 1043 if (icnt1 == 2) { 1044 sub(result_tmp, cnt2, 2); 1045 } 1046 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1047 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1048 BIND(CH1_LOOP); 1049 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1050 cmp(ch1, ch2); 1051 br(EQ, MATCH); 1052 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1053 br(LE, CH1_LOOP); 1054 b(NOMATCH); 1055 } 1056 1057 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 1058 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 1059 1060 BIND(DO3); 1061 (this->*load_2chr)(first, str1); 1062 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 1063 if (icnt1 == 3) { 1064 sub(result_tmp, cnt2, 3); 1065 } 1066 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1067 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1068 BIND(FIRST_LOOP); 1069 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1070 cmpw(first, ch2); 1071 br(EQ, STR1_LOOP); 1072 BIND(STR2_NEXT); 1073 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1074 br(LE, FIRST_LOOP); 1075 b(NOMATCH); 1076 1077 BIND(STR1_LOOP); 1078 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 1079 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 1080 cmp(ch1, ch2); 1081 br(NE, STR2_NEXT); 1082 b(MATCH); 1083 } 1084 1085 if (icnt1 == -1 || icnt1 == 1) { 1086 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 1087 1088 BIND(DO1); 1089 (this->*str1_load_1chr)(ch1, str1); 1090 cmp(cnt2, (u1)8); 1091 br(LT, DO1_SHORT); 1092 1093 sub(result_tmp, cnt2, 8/str2_chr_size); 1094 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1095 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 1096 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1097 1098 if (str2_isL) { 1099 orr(ch1, ch1, ch1, LSL, 8); 1100 } 1101 orr(ch1, ch1, ch1, LSL, 16); 1102 orr(ch1, ch1, ch1, LSL, 32); 1103 BIND(CH1_LOOP); 1104 ldr(ch2, Address(str2, cnt2_neg)); 1105 eor(ch2, ch1, ch2); 1106 sub(tmp1, ch2, tmp3); 1107 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 1108 bics(tmp1, tmp1, tmp2); 1109 br(NE, HAS_ZERO); 1110 adds(cnt2_neg, cnt2_neg, 8); 1111 br(LT, CH1_LOOP); 1112 1113 cmp(cnt2_neg, (u1)8); 1114 mov(cnt2_neg, 0); 1115 br(LT, CH1_LOOP); 1116 b(NOMATCH); 1117 1118 BIND(HAS_ZERO); 1119 rev(tmp1, tmp1); 1120 clz(tmp1, tmp1); 1121 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 1122 b(MATCH); 1123 1124 BIND(DO1_SHORT); 1125 mov(result_tmp, cnt2); 1126 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 1127 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 1128 BIND(DO1_LOOP); 1129 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 1130 cmpw(ch1, ch2); 1131 br(EQ, MATCH); 1132 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1133 br(LT, DO1_LOOP); 1134 } 1135 } 1136 BIND(NOMATCH); 1137 mov(result, -1); 1138 b(DONE); 1139 BIND(MATCH); 1140 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 1141 BIND(DONE); 1142 } 1143 1144 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 1145 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 1146 1147 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 1148 Register ch, Register result, 1149 Register tmp1, Register tmp2, Register tmp3) 1150 { 1151 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1152 Register cnt1_neg = cnt1; 1153 Register ch1 = rscratch1; 1154 Register result_tmp = rscratch2; 1155 1156 cbz(cnt1, NOMATCH); 1157 1158 cmp(cnt1, (u1)4); 1159 br(LT, DO1_SHORT); 1160 1161 orr(ch, ch, ch, LSL, 16); 1162 orr(ch, ch, ch, LSL, 32); 1163 1164 sub(cnt1, cnt1, 4); 1165 mov(result_tmp, cnt1); 1166 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1167 sub(cnt1_neg, zr, cnt1, LSL, 1); 1168 1169 mov(tmp3, 0x0001000100010001); 1170 1171 BIND(CH1_LOOP); 1172 ldr(ch1, Address(str1, cnt1_neg)); 1173 eor(ch1, ch, ch1); 1174 sub(tmp1, ch1, tmp3); 1175 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 1176 bics(tmp1, tmp1, tmp2); 1177 br(NE, HAS_ZERO); 1178 adds(cnt1_neg, cnt1_neg, 8); 1179 br(LT, CH1_LOOP); 1180 1181 cmp(cnt1_neg, (u1)8); 1182 mov(cnt1_neg, 0); 1183 br(LT, CH1_LOOP); 1184 b(NOMATCH); 1185 1186 BIND(HAS_ZERO); 1187 rev(tmp1, tmp1); 1188 clz(tmp1, tmp1); 1189 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1190 b(MATCH); 1191 1192 BIND(DO1_SHORT); 1193 mov(result_tmp, cnt1); 1194 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1195 sub(cnt1_neg, zr, cnt1, LSL, 1); 1196 BIND(DO1_LOOP); 1197 ldrh(ch1, Address(str1, cnt1_neg)); 1198 cmpw(ch, ch1); 1199 br(EQ, MATCH); 1200 adds(cnt1_neg, cnt1_neg, 2); 1201 br(LT, DO1_LOOP); 1202 BIND(NOMATCH); 1203 mov(result, -1); 1204 b(DONE); 1205 BIND(MATCH); 1206 add(result, result_tmp, cnt1_neg, ASR, 1); 1207 BIND(DONE); 1208 } 1209 1210 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 1211 Register ch, Register result, 1212 FloatRegister ztmp1, 1213 FloatRegister ztmp2, 1214 PRegister tmp_pg, 1215 PRegister tmp_pdn, bool isL) 1216 { 1217 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 
1218 assert(tmp_pg->is_governing(), 1219 "this register has to be a governing predicate register"); 1220 1221 Label LOOP, MATCH, DONE, NOMATCH; 1222 Register vec_len = rscratch1; 1223 Register idx = rscratch2; 1224 1225 SIMD_RegVariant T = (isL == true) ? B : H; 1226 1227 cbz(cnt1, NOMATCH); 1228 1229 // Assign the particular char throughout the vector. 1230 sve_dup(ztmp2, T, ch); 1231 if (isL) { 1232 sve_cntb(vec_len); 1233 } else { 1234 sve_cnth(vec_len); 1235 } 1236 mov(idx, 0); 1237 1238 // Generate a predicate to control the reading of input string. 1239 sve_whilelt(tmp_pg, T, idx, cnt1); 1240 1241 BIND(LOOP); 1242 // Read a vector of 8- or 16-bit data depending on the string type. Note 1243 // that inactive elements indicated by the predicate register won't cause 1244 // a data read from memory to the destination vector. 1245 if (isL) { 1246 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1247 } else { 1248 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1249 } 1250 add(idx, idx, vec_len); 1251 1252 // Perform the comparison. An element of the destination predicate is set 1253 // to active if the particular char is matched. 1254 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1255 1256 // Branch if the particular char is found. 1257 br(NE, MATCH); 1258 1259 sve_whilelt(tmp_pg, T, idx, cnt1); 1260 1261 // Loop back if the particular char not found. 1262 br(MI, LOOP); 1263 1264 BIND(NOMATCH); 1265 mov(result, -1); 1266 b(DONE); 1267 1268 BIND(MATCH); 1269 // Undo the index increment. 1270 sub(idx, idx, vec_len); 1271 1272 // Crop the vector to find its location. 1273 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1274 add(result, idx, -1); 1275 sve_incp(result, T, tmp_pdn); 1276 BIND(DONE); 1277 } 1278 1279 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1280 Register ch, Register result, 1281 Register tmp1, Register tmp2, Register tmp3) 1282 { 1283 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1284 Register cnt1_neg = cnt1; 1285 Register ch1 = rscratch1; 1286 Register result_tmp = rscratch2; 1287 1288 cbz(cnt1, NOMATCH); 1289 1290 cmp(cnt1, (u1)8); 1291 br(LT, DO1_SHORT); 1292 1293 orr(ch, ch, ch, LSL, 8); 1294 orr(ch, ch, ch, LSL, 16); 1295 orr(ch, ch, ch, LSL, 32); 1296 1297 sub(cnt1, cnt1, 8); 1298 mov(result_tmp, cnt1); 1299 lea(str1, Address(str1, cnt1)); 1300 sub(cnt1_neg, zr, cnt1); 1301 1302 mov(tmp3, 0x0101010101010101); 1303 1304 BIND(CH1_LOOP); 1305 ldr(ch1, Address(str1, cnt1_neg)); 1306 eor(ch1, ch, ch1); 1307 sub(tmp1, ch1, tmp3); 1308 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1309 bics(tmp1, tmp1, tmp2); 1310 br(NE, HAS_ZERO); 1311 adds(cnt1_neg, cnt1_neg, 8); 1312 br(LT, CH1_LOOP); 1313 1314 cmp(cnt1_neg, (u1)8); 1315 mov(cnt1_neg, 0); 1316 br(LT, CH1_LOOP); 1317 b(NOMATCH); 1318 1319 BIND(HAS_ZERO); 1320 rev(tmp1, tmp1); 1321 clz(tmp1, tmp1); 1322 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1323 b(MATCH); 1324 1325 BIND(DO1_SHORT); 1326 mov(result_tmp, cnt1); 1327 lea(str1, Address(str1, cnt1)); 1328 sub(cnt1_neg, zr, cnt1); 1329 BIND(DO1_LOOP); 1330 ldrb(ch1, Address(str1, cnt1_neg)); 1331 cmp(ch, ch1); 1332 br(EQ, MATCH); 1333 adds(cnt1_neg, cnt1_neg, 1); 1334 br(LT, DO1_LOOP); 1335 BIND(NOMATCH); 1336 mov(result, -1); 1337 b(DONE); 1338 BIND(MATCH); 1339 add(result, result_tmp, cnt1_neg); 1340 BIND(DONE); 1341 } 1342 1343 // Compare strings. 
1344 void C2_MacroAssembler::string_compare(Register str1, Register str2, 1345 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, 1346 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, 1347 PRegister pgtmp1, PRegister pgtmp2, int ae) { 1348 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 1349 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 1350 SHORT_LOOP_START, TAIL_CHECK; 1351 1352 bool isLL = ae == StrIntrinsicNode::LL; 1353 bool isLU = ae == StrIntrinsicNode::LU; 1354 bool isUL = ae == StrIntrinsicNode::UL; 1355 1356 // The stub threshold for LL strings is: 72 (64 + 8) chars 1357 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch) 1358 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least) 1359 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36); 1360 1361 bool str1_isL = isLL || isLU; 1362 bool str2_isL = isLL || isUL; 1363 1364 int str1_chr_shift = str1_isL ? 0 : 1; 1365 int str2_chr_shift = str2_isL ? 0 : 1; 1366 int str1_chr_size = str1_isL ? 1 : 2; 1367 int str2_chr_size = str2_isL ? 1 : 2; 1368 int minCharsInWord = isLL ? wordSize : wordSize/2; 1369 1370 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 1371 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 1372 (chr_insn)&MacroAssembler::ldrh; 1373 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 1374 (chr_insn)&MacroAssembler::ldrh; 1375 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 1376 (uxt_insn)&MacroAssembler::uxthw; 1377 1378 BLOCK_COMMENT("string_compare {"); 1379 1380 // Bizarrely, the counts are passed in bytes, regardless of whether they 1381 // are L or U strings, however the result is always in characters. 1382 if (!str1_isL) asrw(cnt1, cnt1, 1); 1383 if (!str2_isL) asrw(cnt2, cnt2, 1); 1384 1385 // Compute the minimum of the string lengths and save the difference. 
1386 subsw(result, cnt1, cnt2); 1387 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1388 1389 // A very short string 1390 cmpw(cnt2, minCharsInWord); 1391 br(Assembler::LE, SHORT_STRING); 1392 1393 // Compare longwords 1394 // load first parts of strings and finish initialization while loading 1395 { 1396 if (str1_isL == str2_isL) { // LL or UU 1397 ldr(tmp1, Address(str1)); 1398 cmp(str1, str2); 1399 br(Assembler::EQ, DONE); 1400 ldr(tmp2, Address(str2)); 1401 cmp(cnt2, stub_threshold); 1402 br(GE, STUB); 1403 subsw(cnt2, cnt2, minCharsInWord); 1404 br(EQ, TAIL_CHECK); 1405 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1406 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1407 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1408 } else if (isLU) { 1409 ldrs(vtmp, Address(str1)); 1410 ldr(tmp2, Address(str2)); 1411 cmp(cnt2, stub_threshold); 1412 br(GE, STUB); 1413 subw(cnt2, cnt2, 4); 1414 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1415 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1416 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1417 zip1(vtmp, T8B, vtmp, vtmpZ); 1418 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1419 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1420 add(cnt1, cnt1, 4); 1421 fmovd(tmp1, vtmp); 1422 } else { // UL case 1423 ldr(tmp1, Address(str1)); 1424 ldrs(vtmp, Address(str2)); 1425 cmp(cnt2, stub_threshold); 1426 br(GE, STUB); 1427 subw(cnt2, cnt2, 4); 1428 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1429 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1430 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1431 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1432 zip1(vtmp, T8B, vtmp, vtmpZ); 1433 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1434 add(cnt1, cnt1, 8); 1435 fmovd(tmp2, vtmp); 1436 } 1437 adds(cnt2, cnt2, isUL ? 4 : 8); 1438 br(GE, TAIL); 1439 eor(rscratch2, tmp1, tmp2); 1440 cbnz(rscratch2, DIFF); 1441 // main loop 1442 bind(NEXT_WORD); 1443 if (str1_isL == str2_isL) { 1444 ldr(tmp1, Address(str1, cnt2)); 1445 ldr(tmp2, Address(str2, cnt2)); 1446 adds(cnt2, cnt2, 8); 1447 } else if (isLU) { 1448 ldrs(vtmp, Address(str1, cnt1)); 1449 ldr(tmp2, Address(str2, cnt2)); 1450 add(cnt1, cnt1, 4); 1451 zip1(vtmp, T8B, vtmp, vtmpZ); 1452 fmovd(tmp1, vtmp); 1453 adds(cnt2, cnt2, 8); 1454 } else { // UL 1455 ldrs(vtmp, Address(str2, cnt2)); 1456 ldr(tmp1, Address(str1, cnt1)); 1457 zip1(vtmp, T8B, vtmp, vtmpZ); 1458 add(cnt1, cnt1, 8); 1459 fmovd(tmp2, vtmp); 1460 adds(cnt2, cnt2, 4); 1461 } 1462 br(GE, TAIL); 1463 1464 eor(rscratch2, tmp1, tmp2); 1465 cbz(rscratch2, NEXT_WORD); 1466 b(DIFF); 1467 bind(TAIL); 1468 eor(rscratch2, tmp1, tmp2); 1469 cbnz(rscratch2, DIFF); 1470 // Last longword. In the case where length == 4 we compare the 1471 // same longword twice, but that's still faster than another 1472 // conditional branch. 1473 if (str1_isL == str2_isL) { 1474 ldr(tmp1, Address(str1)); 1475 ldr(tmp2, Address(str2)); 1476 } else if (isLU) { 1477 ldrs(vtmp, Address(str1)); 1478 ldr(tmp2, Address(str2)); 1479 zip1(vtmp, T8B, vtmp, vtmpZ); 1480 fmovd(tmp1, vtmp); 1481 } else { // UL 1482 ldrs(vtmp, Address(str2)); 1483 ldr(tmp1, Address(str1)); 1484 zip1(vtmp, T8B, vtmp, vtmpZ); 1485 fmovd(tmp2, vtmp); 1486 } 1487 bind(TAIL_CHECK); 1488 eor(rscratch2, tmp1, tmp2); 1489 cbz(rscratch2, DONE); 1490 1491 // Find the first different characters in the longwords and 1492 // compute their difference. 1493 bind(DIFF); 1494 rev(rscratch2, rscratch2); 1495 clz(rscratch2, rscratch2); 1496 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1497 lsrv(tmp1, tmp1, rscratch2); 1498 (this->*ext_chr)(tmp1, tmp1); 1499 lsrv(tmp2, tmp2, rscratch2); 1500 (this->*ext_chr)(tmp2, tmp2); 1501 subw(result, tmp1, tmp2); 1502 b(DONE); 1503 } 1504 1505 bind(STUB); 1506 RuntimeAddress stub = nullptr; 1507 switch(ae) { 1508 case StrIntrinsicNode::LL: 1509 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1510 break; 1511 case StrIntrinsicNode::UU: 1512 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1513 break; 1514 case StrIntrinsicNode::LU: 1515 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1516 break; 1517 case StrIntrinsicNode::UL: 1518 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1519 break; 1520 default: 1521 ShouldNotReachHere(); 1522 } 1523 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1524 address call = trampoline_call(stub); 1525 if (call == nullptr) { 1526 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1527 ciEnv::current()->record_failure("CodeCache is full"); 1528 return; 1529 } 1530 b(DONE); 1531 1532 bind(SHORT_STRING); 1533 // Is the minimum length zero? 1534 cbz(cnt2, DONE); 1535 // arrange code to do most branches while loading and loading next characters 1536 // while comparing previous 1537 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1538 subs(cnt2, cnt2, 1); 1539 br(EQ, SHORT_LAST_INIT); 1540 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1541 b(SHORT_LOOP_START); 1542 bind(SHORT_LOOP); 1543 subs(cnt2, cnt2, 1); 1544 br(EQ, SHORT_LAST); 1545 bind(SHORT_LOOP_START); 1546 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1547 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1548 cmp(tmp1, cnt1); 1549 br(NE, SHORT_LOOP_TAIL); 1550 subs(cnt2, cnt2, 1); 1551 br(EQ, SHORT_LAST2); 1552 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1553 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1554 cmp(tmp2, rscratch1); 1555 br(EQ, SHORT_LOOP); 1556 sub(result, tmp2, rscratch1); 1557 b(DONE); 1558 bind(SHORT_LOOP_TAIL); 1559 sub(result, tmp1, cnt1); 1560 b(DONE); 1561 bind(SHORT_LAST2); 1562 cmp(tmp2, rscratch1); 1563 br(EQ, DONE); 1564 sub(result, tmp2, rscratch1); 1565 1566 b(DONE); 1567 bind(SHORT_LAST_INIT); 1568 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1569 bind(SHORT_LAST); 1570 cmp(tmp1, cnt1); 1571 br(EQ, DONE); 1572 sub(result, tmp1, cnt1); 1573 1574 bind(DONE); 1575 1576 BLOCK_COMMENT("} string_compare"); 1577 } 1578 1579 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1580 FloatRegister src2, Condition cond, bool isQ) { 1581 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1582 FloatRegister zn = src1, zm = src2; 1583 bool needs_negation = false; 1584 switch (cond) { 1585 case LT: cond = GT; zn = src2; zm = src1; break; 1586 case LE: cond = GE; zn = src2; zm = src1; break; 1587 case LO: cond = HI; zn = src2; zm = src1; break; 1588 case LS: cond = HS; zn = src2; zm = src1; break; 1589 case NE: cond = EQ; needs_negation = true; break; 1590 default: 1591 break; 1592 } 1593 1594 if (is_floating_point_type(bt)) { 1595 fcm(cond, dst, size, zn, zm); 1596 } else { 1597 cm(cond, dst, size, zn, zm); 1598 } 1599 1600 if (needs_negation) { 1601 notr(dst, isQ ? 
T16B : T8B, dst); 1602 } 1603 } 1604 1605 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1606 Condition cond, bool isQ) { 1607 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1608 if (bt == T_FLOAT || bt == T_DOUBLE) { 1609 if (cond == Assembler::NE) { 1610 fcm(Assembler::EQ, dst, size, src); 1611 notr(dst, isQ ? T16B : T8B, dst); 1612 } else { 1613 fcm(cond, dst, size, src); 1614 } 1615 } else { 1616 if (cond == Assembler::NE) { 1617 cm(Assembler::EQ, dst, size, src); 1618 notr(dst, isQ ? T16B : T8B, dst); 1619 } else { 1620 cm(cond, dst, size, src); 1621 } 1622 } 1623 } 1624 1625 // Compress the least significant bit of each byte to the rightmost and clear 1626 // the higher garbage bits. 1627 void C2_MacroAssembler::bytemask_compress(Register dst) { 1628 // Example input, dst = 0x01 00 00 00 01 01 00 01 1629 // The "??" bytes are garbage. 1630 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1631 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1632 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1633 andr(dst, dst, 0xff); // dst = 0x8D 1634 } 1635 1636 // Pack the lowest-numbered bit of each mask element in src into a long value 1637 // in dst, at most the first 64 lane elements. 1638 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1639 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1640 FloatRegister vtmp1, FloatRegister vtmp2) { 1641 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1642 assert_different_registers(dst, rscratch1); 1643 assert_different_registers(vtmp1, vtmp2); 1644 1645 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1646 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1647 // Expected: dst = 0x658D 1648 1649 // Convert the mask into vector with sequential bytes. 1650 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1651 sve_cpy(vtmp1, size, src, 1, false); 1652 if (bt != T_BYTE) { 1653 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1654 } 1655 1656 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1657 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1658 // is to compress each significant bit of the byte in a cross-lane way. Due 1659 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1660 // (bit-compress in each lane) with the biggest lane size (T = D) then 1661 // concatenate the results. 1662 1663 // The second source input of BEXT, initialized with 0x01 in each byte. 1664 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1665 sve_dup(vtmp2, B, 1); 1666 1667 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1668 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1669 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1670 // --------------------------------------- 1671 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1672 sve_bext(vtmp1, D, vtmp1, vtmp2); 1673 1674 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1675 // result to dst. 1676 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1677 // dst = 0x658D 1678 if (lane_cnt <= 8) { 1679 // No need to concatenate. 
1680 umov(dst, vtmp1, B, 0); 1681 } else if (lane_cnt <= 16) { 1682 ins(vtmp1, B, vtmp1, 1, 8); 1683 umov(dst, vtmp1, H, 0); 1684 } else { 1685 // As the lane count is 64 at most, the final expected value must be in 1686 // the lowest 64 bits after narrowing vtmp1 from D to B. 1687 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2); 1688 umov(dst, vtmp1, D, 0); 1689 } 1690 } else if (UseSVE > 0) { 1691 // Compress the lowest 8 bytes. 1692 fmovd(dst, vtmp1); 1693 bytemask_compress(dst); 1694 if (lane_cnt <= 8) return; 1695 1696 // Repeat on higher bytes and join the results. 1697 // Compress 8 bytes in each iteration. 1698 for (int idx = 1; idx < (lane_cnt / 8); idx++) { 1699 sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2); 1700 bytemask_compress(rscratch1); 1701 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); 1702 } 1703 } else { 1704 assert(false, "unsupported"); 1705 ShouldNotReachHere(); 1706 } 1707 } 1708 1709 // Unpack the mask, a long value in src, into predicate register dst based on the 1710 // corresponding data type. Note that dst can support at most 64 lanes. 1711 // The example below gives the expected dst predicate register for different types, with 1712 // a valid src (0x658D) on a machine with a 1024-bit vector size. 1713 // BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D 1714 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51 1715 // INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01 1716 // LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01 1717 // 1718 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which 1719 // has 24 significant bits would be an invalid input if the dst predicate register refers to 1720 // a LONG type 1024-bit vector, which has at most 16 lanes. 1721 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt, 1722 FloatRegister vtmp1, FloatRegister vtmp2) { 1723 assert(UseSVE == 2 && VM_Version::supports_svebitperm() && 1724 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported"); 1725 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1726 // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16 1727 // Expected: dst = 0b01100101 10001101 1728 1729 // Put long value from general purpose register into the first lane of vector. 1730 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1731 sve_dup(vtmp1, B, 0); 1732 mov(vtmp1, D, 0, src); 1733 1734 // As sve_cmp generates the mask with a minimum granularity of one byte, we first have to 1735 // expand the bit mask held in the first lane into a byte mask, which can be done with 1736 // SVE2's BDEP instruction. 1737 1738 // The first source input of the BDEP instruction. Deposit each byte into its own 8-byte group. 1739 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1740 if (lane_cnt <= 8) { 1741 // Nothing to do, as only one byte exists. 1742 } else if (lane_cnt <= 16) { 1743 ins(vtmp1, B, vtmp1, 8, 1); 1744 mov(vtmp1, B, 1, zr); 1745 } else { 1746 sve_vector_extend(vtmp1, D, vtmp1, B); 1747 } 1748 1749 // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
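// (This BDEP step is the inverse of the BEXT compression in sve_vmask_tolong above: with 0x01 in every byte of the second operand, each mask bit is scattered back into the low bit of its own byte.)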
1750 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1751 sve_dup(vtmp2, B, 1); 1752 1753 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1754 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1755 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1756 // --------------------------------------- 1757 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1758 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1759 1760 if (bt != T_BYTE) { 1761 sve_vector_extend(vtmp1, size, vtmp1, B); 1762 } 1763 // Generate mask according to the given vector, in which the elements have been 1764 // extended to expected type. 1765 // dst = 0b01101001 10001101 1766 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1767 } 1768 1769 // Clobbers: rflags 1770 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1771 FloatRegister zn, FloatRegister zm, Condition cond) { 1772 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1773 FloatRegister z1 = zn, z2 = zm; 1774 switch (cond) { 1775 case LE: z1 = zm; z2 = zn; cond = GE; break; 1776 case LT: z1 = zm; z2 = zn; cond = GT; break; 1777 case LO: z1 = zm; z2 = zn; cond = HI; break; 1778 case LS: z1 = zm; z2 = zn; cond = HS; break; 1779 default: 1780 break; 1781 } 1782 1783 SIMD_RegVariant size = elemType_to_regVariant(bt); 1784 if (is_floating_point_type(bt)) { 1785 sve_fcm(cond, pd, size, pg, z1, z2); 1786 } else { 1787 assert(is_integral_type(bt), "unsupported element type"); 1788 sve_cmp(cond, pd, size, pg, z1, z2); 1789 } 1790 } 1791 1792 // Get index of the last mask lane that is set 1793 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1794 SIMD_RegVariant size = elemType_to_regVariant(bt); 1795 sve_rev(ptmp, size, src); 1796 sve_brkb(ptmp, ptrue, ptmp, false); 1797 sve_cntp(dst, size, ptrue, ptmp); 1798 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1799 subw(dst, rscratch1, dst); 1800 } 1801 1802 // Extend integer vector src to dst with the same lane count 1803 // but larger element size, e.g. 4B -> 4I 1804 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1805 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1806 if (src_bt == T_BYTE) { 1807 if (dst_bt == T_SHORT) { 1808 // 4B/8B to 4S/8S 1809 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1810 } else { 1811 // 4B to 4I 1812 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1813 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1814 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1815 } 1816 } else if (src_bt == T_SHORT) { 1817 // 4S to 4I 1818 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1819 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1820 } else if (src_bt == T_INT) { 1821 // 2I to 2L 1822 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1823 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1824 } else { 1825 ShouldNotReachHere(); 1826 } 1827 } 1828 1829 // Narrow integer vector src down to dst with the same lane count 1830 // but smaller element size, e.g. 
4I -> 4B 1831 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1832 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1833 if (src_bt == T_SHORT) { 1834 // 4S/8S to 4B/8B 1835 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1836 assert(dst_bt == T_BYTE, "unsupported"); 1837 xtn(dst, T8B, src, T8H); 1838 } else if (src_bt == T_INT) { 1839 // 4I to 4B/4S 1840 assert(src_vlen_in_bytes == 16, "unsupported"); 1841 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1842 xtn(dst, T4H, src, T4S); 1843 if (dst_bt == T_BYTE) { 1844 xtn(dst, T8B, dst, T8H); 1845 } 1846 } else if (src_bt == T_LONG) { 1847 // 2L to 2I 1848 assert(src_vlen_in_bytes == 16, "unsupported"); 1849 assert(dst_bt == T_INT, "unsupported"); 1850 xtn(dst, T2S, src, T2D); 1851 } else { 1852 ShouldNotReachHere(); 1853 } 1854 } 1855 1856 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1857 FloatRegister src, SIMD_RegVariant src_size, 1858 bool is_unsigned) { 1859 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1860 1861 if (src_size == B) { 1862 switch (dst_size) { 1863 case H: 1864 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1865 break; 1866 case S: 1867 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1868 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1869 break; 1870 case D: 1871 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1872 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1873 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1874 break; 1875 default: 1876 ShouldNotReachHere(); 1877 } 1878 } else if (src_size == H) { 1879 if (dst_size == S) { 1880 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1881 } else { // D 1882 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1883 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1884 } 1885 } else if (src_size == S) { 1886 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1887 } 1888 } 1889 1890 // Vector narrow from src to dst with specified element sizes. 1891 // High part of dst vector will be filled with zero. 1892 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1893 FloatRegister src, SIMD_RegVariant src_size, 1894 FloatRegister tmp) { 1895 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1896 assert_different_registers(src, tmp); 1897 sve_dup(tmp, src_size, 0); 1898 if (src_size == D) { 1899 switch (dst_size) { 1900 case S: 1901 sve_uzp1(dst, S, src, tmp); 1902 break; 1903 case H: 1904 assert_different_registers(dst, tmp); 1905 sve_uzp1(dst, S, src, tmp); 1906 sve_uzp1(dst, H, dst, tmp); 1907 break; 1908 case B: 1909 assert_different_registers(dst, tmp); 1910 sve_uzp1(dst, S, src, tmp); 1911 sve_uzp1(dst, H, dst, tmp); 1912 sve_uzp1(dst, B, dst, tmp); 1913 break; 1914 default: 1915 ShouldNotReachHere(); 1916 } 1917 } else if (src_size == S) { 1918 if (dst_size == H) { 1919 sve_uzp1(dst, H, src, tmp); 1920 } else { // B 1921 assert_different_registers(dst, tmp); 1922 sve_uzp1(dst, H, src, tmp); 1923 sve_uzp1(dst, B, dst, tmp); 1924 } 1925 } else if (src_size == H) { 1926 sve_uzp1(dst, B, src, tmp); 1927 } 1928 } 1929 1930 // Extend src predicate to dst predicate with the same lane count but larger 1931 // element size, e.g. 
64Byte -> 512Long 1932 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1933 uint dst_element_length_in_bytes, 1934 uint src_element_length_in_bytes) { 1935 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1936 sve_punpklo(dst, src); 1937 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1938 sve_punpklo(dst, src); 1939 sve_punpklo(dst, dst); 1940 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1941 sve_punpklo(dst, src); 1942 sve_punpklo(dst, dst); 1943 sve_punpklo(dst, dst); 1944 } else { 1945 assert(false, "unsupported"); 1946 ShouldNotReachHere(); 1947 } 1948 } 1949 1950 // Narrow src predicate to dst predicate with the same lane count but 1951 // smaller element size, e.g. 512Long -> 64Byte 1952 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1953 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1954 // The insignificant bits in src predicate are expected to be zero. 1955 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1956 // passed as the second argument. An example narrowing operation with a given mask would be - 1957 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1958 // Mask (for 2 Longs) : TF 1959 // Predicate register for the above mask (16 bits) : 00000001 00000000 1960 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1961 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1962 assert_different_registers(src, ptmp); 1963 assert_different_registers(dst, ptmp); 1964 sve_pfalse(ptmp); 1965 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1966 sve_uzp1(dst, B, src, ptmp); 1967 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1968 sve_uzp1(dst, H, src, ptmp); 1969 sve_uzp1(dst, B, dst, ptmp); 1970 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1971 sve_uzp1(dst, S, src, ptmp); 1972 sve_uzp1(dst, H, dst, ptmp); 1973 sve_uzp1(dst, B, dst, ptmp); 1974 } else { 1975 assert(false, "unsupported"); 1976 ShouldNotReachHere(); 1977 } 1978 } 1979 1980 // Vector reduction add for integral type with ASIMD instructions. 1981 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1982 Register isrc, FloatRegister vsrc, 1983 unsigned vector_length_in_bytes, 1984 FloatRegister vtmp) { 1985 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1986 assert_different_registers(dst, isrc); 1987 bool isQ = vector_length_in_bytes == 16; 1988 1989 BLOCK_COMMENT("neon_reduce_add_integral {"); 1990 switch(bt) { 1991 case T_BYTE: 1992 addv(vtmp, isQ ? T16B : T8B, vsrc); 1993 smov(dst, vtmp, B, 0); 1994 addw(dst, dst, isrc, ext::sxtb); 1995 break; 1996 case T_SHORT: 1997 addv(vtmp, isQ ? T8H : T4H, vsrc); 1998 smov(dst, vtmp, H, 0); 1999 addw(dst, dst, isrc, ext::sxth); 2000 break; 2001 case T_INT: 2002 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 2003 umov(dst, vtmp, S, 0); 2004 addw(dst, dst, isrc); 2005 break; 2006 case T_LONG: 2007 assert(isQ, "unsupported"); 2008 addpd(vtmp, vsrc); 2009 umov(dst, vtmp, D, 0); 2010 add(dst, dst, isrc); 2011 break; 2012 default: 2013 assert(false, "unsupported"); 2014 ShouldNotReachHere(); 2015 } 2016 BLOCK_COMMENT("} neon_reduce_add_integral"); 2017 } 2018 2019 // Vector reduction multiply for integral type with ASIMD instructions. 
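// The vector is repeatedly folded in half (16 -> 8 -> 4 -> 2 lanes for T_BYTE on a 128-bit vector), multiplying the two halves lane by lane; the last two lanes are then combined with the scalar input isrc. For T_INT on a 128-bit vector this is roughly: [a0 a1 a2 a3] -> [a0*a2, a1*a3] -> dst = (a0*a2) * isrc * (a1*a3).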
2020 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 2021 // Clobbers: rscratch1 2022 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 2023 Register isrc, FloatRegister vsrc, 2024 unsigned vector_length_in_bytes, 2025 FloatRegister vtmp1, FloatRegister vtmp2) { 2026 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2027 bool isQ = vector_length_in_bytes == 16; 2028 2029 BLOCK_COMMENT("neon_reduce_mul_integral {"); 2030 switch(bt) { 2031 case T_BYTE: 2032 if (isQ) { 2033 // Multiply the lower half and higher half of vector iteratively. 2034 // vtmp1 = vsrc[8:15] 2035 ins(vtmp1, D, vsrc, 0, 1); 2036 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 2037 mulv(vtmp1, T8B, vtmp1, vsrc); 2038 // vtmp2 = vtmp1[4:7] 2039 ins(vtmp2, S, vtmp1, 0, 1); 2040 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 2041 mulv(vtmp1, T8B, vtmp2, vtmp1); 2042 } else { 2043 ins(vtmp1, S, vsrc, 0, 1); 2044 mulv(vtmp1, T8B, vtmp1, vsrc); 2045 } 2046 // vtmp2 = vtmp1[2:3] 2047 ins(vtmp2, H, vtmp1, 0, 1); 2048 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 2049 mulv(vtmp2, T8B, vtmp2, vtmp1); 2050 // dst = vtmp2[0] * isrc * vtmp2[1] 2051 umov(rscratch1, vtmp2, B, 0); 2052 mulw(dst, rscratch1, isrc); 2053 sxtb(dst, dst); 2054 umov(rscratch1, vtmp2, B, 1); 2055 mulw(dst, rscratch1, dst); 2056 sxtb(dst, dst); 2057 break; 2058 case T_SHORT: 2059 if (isQ) { 2060 ins(vtmp2, D, vsrc, 0, 1); 2061 mulv(vtmp2, T4H, vtmp2, vsrc); 2062 ins(vtmp1, S, vtmp2, 0, 1); 2063 mulv(vtmp1, T4H, vtmp1, vtmp2); 2064 } else { 2065 ins(vtmp1, S, vsrc, 0, 1); 2066 mulv(vtmp1, T4H, vtmp1, vsrc); 2067 } 2068 umov(rscratch1, vtmp1, H, 0); 2069 mulw(dst, rscratch1, isrc); 2070 sxth(dst, dst); 2071 umov(rscratch1, vtmp1, H, 1); 2072 mulw(dst, rscratch1, dst); 2073 sxth(dst, dst); 2074 break; 2075 case T_INT: 2076 if (isQ) { 2077 ins(vtmp1, D, vsrc, 0, 1); 2078 mulv(vtmp1, T2S, vtmp1, vsrc); 2079 } else { 2080 vtmp1 = vsrc; 2081 } 2082 umov(rscratch1, vtmp1, S, 0); 2083 mul(dst, rscratch1, isrc); 2084 umov(rscratch1, vtmp1, S, 1); 2085 mul(dst, rscratch1, dst); 2086 break; 2087 case T_LONG: 2088 umov(rscratch1, vsrc, D, 0); 2089 mul(dst, isrc, rscratch1); 2090 umov(rscratch1, vsrc, D, 1); 2091 mul(dst, dst, rscratch1); 2092 break; 2093 default: 2094 assert(false, "unsupported"); 2095 ShouldNotReachHere(); 2096 } 2097 BLOCK_COMMENT("} neon_reduce_mul_integral"); 2098 } 2099 2100 // Vector reduction multiply for floating-point type with ASIMD instructions. 
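// The lanes are multiplied strictly in order: dst = fsrc * vsrc[0] * vsrc[1] for 64-bit vectors, and dst = fsrc * vsrc[0] * vsrc[1] * vsrc[2] * vsrc[3] for 128-bit float vectors (fsrc * vsrc[0] * vsrc[1] for double).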
2101 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 2102 FloatRegister fsrc, FloatRegister vsrc, 2103 unsigned vector_length_in_bytes, 2104 FloatRegister vtmp) { 2105 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2106 bool isQ = vector_length_in_bytes == 16; 2107 2108 BLOCK_COMMENT("neon_reduce_mul_fp {"); 2109 switch(bt) { 2110 case T_FLOAT: 2111 fmuls(dst, fsrc, vsrc); 2112 ins(vtmp, S, vsrc, 0, 1); 2113 fmuls(dst, dst, vtmp); 2114 if (isQ) { 2115 ins(vtmp, S, vsrc, 0, 2); 2116 fmuls(dst, dst, vtmp); 2117 ins(vtmp, S, vsrc, 0, 3); 2118 fmuls(dst, dst, vtmp); 2119 } 2120 break; 2121 case T_DOUBLE: 2122 assert(isQ, "unsupported"); 2123 fmuld(dst, fsrc, vsrc); 2124 ins(vtmp, D, vsrc, 0, 1); 2125 fmuld(dst, dst, vtmp); 2126 break; 2127 default: 2128 assert(false, "unsupported"); 2129 ShouldNotReachHere(); 2130 } 2131 BLOCK_COMMENT("} neon_reduce_mul_fp"); 2132 } 2133 2134 // Helper to select logical instruction 2135 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 2136 Register Rn, Register Rm, 2137 enum shift_kind kind, unsigned shift) { 2138 switch(opc) { 2139 case Op_AndReductionV: 2140 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 2141 break; 2142 case Op_OrReductionV: 2143 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 2144 break; 2145 case Op_XorReductionV: 2146 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 2147 break; 2148 default: 2149 assert(false, "unsupported"); 2150 ShouldNotReachHere(); 2151 } 2152 } 2153 2154 // Vector reduction logical operations And, Or, Xor 2155 // Clobbers: rscratch1 2156 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2157 Register isrc, FloatRegister vsrc, 2158 unsigned vector_length_in_bytes) { 2159 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2160 "unsupported"); 2161 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2162 assert_different_registers(dst, isrc); 2163 bool isQ = vector_length_in_bytes == 16; 2164 2165 BLOCK_COMMENT("neon_reduce_logical {"); 2166 umov(rscratch1, vsrc, isQ ? D : S, 0); 2167 umov(dst, vsrc, isQ ? 
D : S, 1); 2168 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2169 switch(bt) { 2170 case T_BYTE: 2171 if (isQ) { 2172 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2173 } 2174 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2175 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2176 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2177 sxtb(dst, dst); 2178 break; 2179 case T_SHORT: 2180 if (isQ) { 2181 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2182 } 2183 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2184 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2185 sxth(dst, dst); 2186 break; 2187 case T_INT: 2188 if (isQ) { 2189 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2190 } 2191 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2192 break; 2193 case T_LONG: 2194 assert(isQ, "unsupported"); 2195 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2196 break; 2197 default: 2198 assert(false, "unsupported"); 2199 ShouldNotReachHere(); 2200 } 2201 BLOCK_COMMENT("} neon_reduce_logical"); 2202 } 2203 2204 // Vector reduction min/max for integral type with ASIMD instructions. 2205 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2206 // Clobbers: rscratch1, rflags 2207 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2208 Register isrc, FloatRegister vsrc, 2209 unsigned vector_length_in_bytes, 2210 FloatRegister vtmp) { 2211 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2212 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2213 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2214 assert_different_registers(dst, isrc); 2215 bool isQ = vector_length_in_bytes == 16; 2216 bool is_min = opc == Op_MinReductionV; 2217 2218 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2219 if (bt == T_LONG) { 2220 assert(vtmp == fnoreg, "should be"); 2221 assert(isQ, "should be"); 2222 umov(rscratch1, vsrc, D, 0); 2223 cmp(isrc, rscratch1); 2224 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2225 umov(rscratch1, vsrc, D, 1); 2226 cmp(dst, rscratch1); 2227 csel(dst, dst, rscratch1, is_min ? LT : GT); 2228 } else { 2229 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2230 if (size == T2S) { 2231 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2232 } else { 2233 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2234 } 2235 if (bt == T_INT) { 2236 umov(dst, vtmp, S, 0); 2237 } else { 2238 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2239 } 2240 cmpw(dst, isrc); 2241 cselw(dst, dst, isrc, is_min ? LT : GT); 2242 } 2243 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2244 } 2245 2246 // Vector reduction for integral type with SVE instruction. 2247 // Supported operations are Add, And, Or, Xor, Max, Min. 2248 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
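// The same shape is used for every opcode: reduce src2 into lane 0 of tmp with the predicated SVE reduction, move that lane into dst (sign-extending sub-word types via smov), then fold in the scalar input src1 with the matching scalar instruction; logical results of sub-word types are re-sign-extended at the end.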
2249 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2250 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2251 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2252 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2253 assert_different_registers(src1, dst); 2254 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2255 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2256 switch (opc) { 2257 case Op_AddReductionVI: { 2258 sve_uaddv(tmp, size, pg, src2); 2259 if (bt == T_BYTE) { 2260 smov(dst, tmp, size, 0); 2261 addw(dst, src1, dst, ext::sxtb); 2262 } else if (bt == T_SHORT) { 2263 smov(dst, tmp, size, 0); 2264 addw(dst, src1, dst, ext::sxth); 2265 } else { 2266 umov(dst, tmp, size, 0); 2267 addw(dst, dst, src1); 2268 } 2269 break; 2270 } 2271 case Op_AddReductionVL: { 2272 sve_uaddv(tmp, size, pg, src2); 2273 umov(dst, tmp, size, 0); 2274 add(dst, dst, src1); 2275 break; 2276 } 2277 case Op_AndReductionV: { 2278 sve_andv(tmp, size, pg, src2); 2279 if (bt == T_INT || bt == T_LONG) { 2280 umov(dst, tmp, size, 0); 2281 } else { 2282 smov(dst, tmp, size, 0); 2283 } 2284 if (bt == T_LONG) { 2285 andr(dst, dst, src1); 2286 } else { 2287 andw(dst, dst, src1); 2288 } 2289 break; 2290 } 2291 case Op_OrReductionV: { 2292 sve_orv(tmp, size, pg, src2); 2293 if (bt == T_INT || bt == T_LONG) { 2294 umov(dst, tmp, size, 0); 2295 } else { 2296 smov(dst, tmp, size, 0); 2297 } 2298 if (bt == T_LONG) { 2299 orr(dst, dst, src1); 2300 } else { 2301 orrw(dst, dst, src1); 2302 } 2303 break; 2304 } 2305 case Op_XorReductionV: { 2306 sve_eorv(tmp, size, pg, src2); 2307 if (bt == T_INT || bt == T_LONG) { 2308 umov(dst, tmp, size, 0); 2309 } else { 2310 smov(dst, tmp, size, 0); 2311 } 2312 if (bt == T_LONG) { 2313 eor(dst, dst, src1); 2314 } else { 2315 eorw(dst, dst, src1); 2316 } 2317 break; 2318 } 2319 case Op_MaxReductionV: { 2320 sve_smaxv(tmp, size, pg, src2); 2321 if (bt == T_INT || bt == T_LONG) { 2322 umov(dst, tmp, size, 0); 2323 } else { 2324 smov(dst, tmp, size, 0); 2325 } 2326 if (bt == T_LONG) { 2327 cmp(dst, src1); 2328 csel(dst, dst, src1, Assembler::GT); 2329 } else { 2330 cmpw(dst, src1); 2331 cselw(dst, dst, src1, Assembler::GT); 2332 } 2333 break; 2334 } 2335 case Op_MinReductionV: { 2336 sve_sminv(tmp, size, pg, src2); 2337 if (bt == T_INT || bt == T_LONG) { 2338 umov(dst, tmp, size, 0); 2339 } else { 2340 smov(dst, tmp, size, 0); 2341 } 2342 if (bt == T_LONG) { 2343 cmp(dst, src1); 2344 csel(dst, dst, src1, Assembler::LT); 2345 } else { 2346 cmpw(dst, src1); 2347 cselw(dst, dst, src1, Assembler::LT); 2348 } 2349 break; 2350 } 2351 default: 2352 assert(false, "unsupported"); 2353 ShouldNotReachHere(); 2354 } 2355 2356 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2357 if (bt == T_BYTE) { 2358 sxtb(dst, dst); 2359 } else if (bt == T_SHORT) { 2360 sxth(dst, dst); 2361 } 2362 } 2363 } 2364 2365 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2366 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2367 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
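// Counts that match one of the fixed PTRUE patterns (VL1-VL8, VL16/VL32/VL64/VL128/VL256, POW2, MUL4, MUL3) are generated with a single sve_ptrue; any other count is materialized in rscratch1 and handled with sve_whileltw, which is what clobbers the flags.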
2368 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2369 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2370 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2371 2372 // Set all elements to false if the input "lane_cnt" is zero. 2373 if (lane_cnt == 0) { 2374 sve_pfalse(dst); 2375 return; 2376 } 2377 2378 SIMD_RegVariant size = elemType_to_regVariant(bt); 2379 assert(size != Q, "invalid size"); 2380 2381 // Set all true if "lane_cnt" equals to the max lane count. 2382 if (lane_cnt == max_vector_length) { 2383 sve_ptrue(dst, size, /* ALL */ 0b11111); 2384 return; 2385 } 2386 2387 // Fixed numbers for "ptrue". 2388 switch(lane_cnt) { 2389 case 1: /* VL1 */ 2390 case 2: /* VL2 */ 2391 case 3: /* VL3 */ 2392 case 4: /* VL4 */ 2393 case 5: /* VL5 */ 2394 case 6: /* VL6 */ 2395 case 7: /* VL7 */ 2396 case 8: /* VL8 */ 2397 sve_ptrue(dst, size, lane_cnt); 2398 return; 2399 case 16: 2400 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2401 return; 2402 case 32: 2403 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2404 return; 2405 case 64: 2406 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2407 return; 2408 case 128: 2409 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2410 return; 2411 case 256: 2412 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2413 return; 2414 default: 2415 break; 2416 } 2417 2418 // Special patterns for "ptrue". 2419 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2420 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2421 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2422 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2423 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2424 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2425 } else { 2426 // Encode to "whileltw" for the remaining cases. 2427 mov(rscratch1, lane_cnt); 2428 sve_whileltw(dst, size, zr, rscratch1); 2429 } 2430 } 2431 2432 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2433 // Any remaining elements of dst will be filled with zero. 2434 // Clobbers: rscratch1 2435 // Preserves: src, mask 2436 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2437 FloatRegister vtmp1, FloatRegister vtmp2, 2438 PRegister pgtmp) { 2439 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2440 assert_different_registers(dst, src, vtmp1, vtmp2); 2441 assert_different_registers(mask, pgtmp); 2442 2443 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 2444 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 2445 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 2446 sve_dup(vtmp2, H, 0); 2447 2448 // Extend lowest half to type INT. 2449 // dst = 00004444 00003333 00002222 00001111 2450 sve_uunpklo(dst, S, src); 2451 // pgtmp = 00000001 00000000 00000001 00000001 2452 sve_punpklo(pgtmp, mask); 2453 // Pack the active elements in size of type INT to the right, 2454 // and fill the remainings with zero. 2455 // dst = 00000000 00004444 00002222 00001111 2456 sve_compact(dst, S, dst, pgtmp); 2457 // Narrow the result back to type SHORT. 2458 // dst = 0000 0000 0000 0000 0000 4444 2222 1111 2459 sve_uzp1(dst, H, dst, vtmp2); 2460 // Count the active elements of lowest half. 2461 // rscratch1 = 3 2462 sve_cntp(rscratch1, S, ptrue, pgtmp); 2463 2464 // Repeat to the highest half. 
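// (The high half is compressed into vtmp1 so that dst, which already holds the compressed low half, is preserved.)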
2465 // pgtmp = 00000001 00000000 00000000 00000001 2466 sve_punpkhi(pgtmp, mask); 2467 // vtmp1 = 00008888 00007777 00006666 00005555 2468 sve_uunpkhi(vtmp1, S, src); 2469 // vtmp1 = 00000000 00000000 00008888 00005555 2470 sve_compact(vtmp1, S, vtmp1, pgtmp); 2471 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2472 sve_uzp1(vtmp1, H, vtmp1, vtmp2); 2473 2474 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111 2475 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2476 // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where 2477 // TRUE_CNT is the number of active elements in the compressed low part. 2478 neg(rscratch1, rscratch1); 2479 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2480 sve_index(vtmp2, H, rscratch1, 1); 2481 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000 2482 sve_tbl(vtmp1, H, vtmp1, vtmp2); 2483 2484 // Combine the compressed high part (after shifting) with the compressed low part. 2485 // dst = 0000 0000 0000 8888 5555 4444 2222 1111 2486 sve_orr(dst, dst, vtmp1); 2487 } 2488 2489 // Clobbers: rscratch1, rscratch2 2490 // Preserves: src, mask 2491 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, 2492 FloatRegister vtmp1, FloatRegister vtmp2, 2493 FloatRegister vtmp3, FloatRegister vtmp4, 2494 PRegister ptmp, PRegister pgtmp) { 2495 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2496 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4); 2497 assert_different_registers(mask, ptmp, pgtmp); 2498 // Example input: src = 88 77 66 55 44 33 22 11 2499 // mask = 01 00 00 01 01 00 01 01 2500 // Expected result: dst = 00 00 00 88 55 44 22 11 2501 2502 sve_dup(vtmp4, B, 0); 2503 // Extend lowest half to type SHORT. 2504 // vtmp1 = 0044 0033 0022 0011 2505 sve_uunpklo(vtmp1, H, src); 2506 // ptmp = 0001 0000 0001 0001 2507 sve_punpklo(ptmp, mask); 2508 // Count the active elements of lowest half. 2509 // rscratch2 = 3 2510 sve_cntp(rscratch2, H, ptrue, ptmp); 2511 // Pack the active elements in size of type SHORT to the right, 2512 // and fill the remaining lanes with zero. 2513 // dst = 0000 0044 0022 0011 2514 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp); 2515 // Narrow the result back to type BYTE. 2516 // dst = 00 00 00 00 00 44 22 11 2517 sve_uzp1(dst, B, dst, vtmp4); 2518 2519 // Repeat to the highest half. 2520 // ptmp = 0001 0000 0000 0001 2521 sve_punpkhi(ptmp, mask); 2522 // vtmp2 = 0088 0077 0066 0055 2523 sve_uunpkhi(vtmp2, H, src); 2524 // vtmp1 = 0000 0000 0088 0055 2525 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp); 2526 2527 sve_dup(vtmp4, B, 0); 2528 // vtmp1 = 00 00 00 00 00 00 88 55 2529 sve_uzp1(vtmp1, B, vtmp1, vtmp4); 2530 2531 // Compressed low: dst = 00 00 00 00 00 44 22 11 2532 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55 2533 // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where 2534 // TRUE_CNT is the number of active elements in the compressed low part. 2535 neg(rscratch2, rscratch2); 2536 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2537 sve_index(vtmp2, B, rscratch2, 1); 2538 // vtmp1 = 00 00 00 88 55 00 00 00 2539 sve_tbl(vtmp1, B, vtmp1, vtmp2); 2540 // Combine the compressed high part (after shifting) with the compressed low part.
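// (The table lookup fills the low TRUE_CNT lanes of vtmp1 with zero, so a plain OR merges the two halves without overlap.)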
2541 // dst = 00 00 00 88 55 44 22 11 2542 sve_orr(dst, dst, vtmp1); 2543 } 2544 2545 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2546 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2547 SIMD_Arrangement size = isQ ? T16B : T8B; 2548 if (bt == T_BYTE) { 2549 rbit(dst, size, src); 2550 } else { 2551 neon_reverse_bytes(dst, src, bt, isQ); 2552 rbit(dst, size, dst); 2553 } 2554 } 2555 2556 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2557 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2558 SIMD_Arrangement size = isQ ? T16B : T8B; 2559 switch (bt) { 2560 case T_BYTE: 2561 if (dst != src) { 2562 orr(dst, size, src, src); 2563 } 2564 break; 2565 case T_SHORT: 2566 rev16(dst, size, src); 2567 break; 2568 case T_INT: 2569 rev32(dst, size, src); 2570 break; 2571 case T_LONG: 2572 rev64(dst, size, src); 2573 break; 2574 default: 2575 assert(false, "unsupported"); 2576 ShouldNotReachHere(); 2577 } 2578 } 2579 2580 // VectorRearrange implementation for short/int/float/long/double types with NEON 2581 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction. 2582 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group. 2583 // For VectorRearrange long/double, we compare the shuffle input with iota indices, 2584 // and use bsl to implement the operation. 2585 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src, 2586 FloatRegister shuffle, FloatRegister tmp, 2587 BasicType bt, bool isQ) { 2588 assert_different_registers(dst, src, shuffle, tmp); 2589 SIMD_Arrangement size1 = isQ ? T16B : T8B; 2590 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); 2591 2592 // Here is an example that rearranges a NEON vector with 4 ints: 2593 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] 2594 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1]. 2595 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector 2596 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get 2597 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. 2598 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100], 2599 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] 2600 // 4. Use Vm as index register, and use V1 as table register. 2601 // Then get V2 as the result by tbl NEON instructions. 2602 switch (bt) { 2603 case T_SHORT: 2604 mov(tmp, size1, 0x02); 2605 mulv(dst, size2, shuffle, tmp); 2606 mov(tmp, size2, 0x0100); 2607 addv(dst, size1, dst, tmp); 2608 tbl(dst, size1, src, 1, dst); 2609 break; 2610 case T_INT: 2611 case T_FLOAT: 2612 mov(tmp, size1, 0x04); 2613 mulv(dst, size2, shuffle, tmp); 2614 mov(tmp, size2, 0x03020100); 2615 addv(dst, size1, dst, tmp); 2616 tbl(dst, size1, src, 1, dst); 2617 break; 2618 case T_LONG: 2619 case T_DOUBLE: 2620 // Load the iota indices for Long type. The indices are ordered by 2621 // type B/S/I/L/F/D, and the offset between two types is 16; Hence 2622 // the offset for L is 48. 2623 lea(rscratch1, 2624 ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48)); 2625 ldrq(tmp, rscratch1); 2626 // Check whether the input "shuffle" is the same with iota indices. 2627 // Return "src" if true, otherwise swap the two elements of "src". 
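// cm(EQ) sets a lane of dst to all ones where "shuffle" matches the iota pair {0, 1}; ext rotates src by 8 bytes to form the swapped pair, and bsl then keeps the src lane where it matched and takes the swapped lane where it did not.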
2628 cm(EQ, dst, size2, shuffle, tmp); 2629 ext(tmp, size1, src, src, 8); 2630 bsl(dst, size1, src, tmp); 2631 break; 2632 default: 2633 assert(false, "unsupported element type"); 2634 ShouldNotReachHere(); 2635 } 2636 } 2637 2638 // Extract a scalar element from an sve vector at position 'idx'. 2639 // The input elements in src are expected to be of integral type. 2640 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2641 int idx, FloatRegister vtmp) { 2642 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2643 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2644 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2645 if (bt == T_INT || bt == T_LONG) { 2646 umov(dst, src, size, idx); 2647 } else { 2648 smov(dst, src, size, idx); 2649 } 2650 } else { 2651 sve_orr(vtmp, src, src); 2652 sve_ext(vtmp, vtmp, idx << size); 2653 if (bt == T_INT || bt == T_LONG) { 2654 umov(dst, vtmp, size, 0); 2655 } else { 2656 smov(dst, vtmp, size, 0); 2657 } 2658 } 2659 } 2660 2661 // java.lang.Math::round intrinsics 2662 2663 // Clobbers: rscratch1, rflags 2664 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2665 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2666 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2667 switch (T) { 2668 case T2S: 2669 case T4S: 2670 fmovs(tmp1, T, 0.5f); 2671 mov(rscratch1, jint_cast(0x1.0p23f)); 2672 break; 2673 case T2D: 2674 fmovd(tmp1, T, 0.5); 2675 mov(rscratch1, julong_cast(0x1.0p52)); 2676 break; 2677 default: 2678 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2679 } 2680 fadd(tmp1, T, tmp1, src); 2681 fcvtms(tmp1, T, tmp1); 2682 // tmp1 = floor(src + 0.5, ties to even) 2683 2684 fcvtas(dst, T, src); 2685 // dst = round(src), ties to away 2686 2687 fneg(tmp3, T, src); 2688 dup(tmp2, T, rscratch1); 2689 cm(HS, tmp3, T, tmp3, tmp2); 2690 // tmp3 is now a set of flags 2691 2692 bif(dst, T16B, tmp1, tmp3); 2693 // result in dst 2694 } 2695 2696 // Clobbers: rscratch1, rflags 2697 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2698 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2699 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2700 assert_different_registers(tmp1, tmp2, src, dst); 2701 2702 switch (T) { 2703 case S: 2704 mov(rscratch1, jint_cast(0x1.0p23f)); 2705 break; 2706 case D: 2707 mov(rscratch1, julong_cast(0x1.0p52)); 2708 break; 2709 default: 2710 assert(T == S || T == D, "invalid register variant"); 2711 } 2712 2713 sve_frinta(dst, T, ptrue, src); 2714 // dst = round(src), ties to away 2715 2716 Label none; 2717 2718 sve_fneg(tmp1, T, ptrue, src); 2719 sve_dup(tmp2, T, rscratch1); 2720 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2721 br(EQ, none); 2722 { 2723 sve_cpy(tmp1, T, pgtmp, 0.5); 2724 sve_fadd(tmp1, T, pgtmp, src); 2725 sve_frintm(dst, T, pgtmp, tmp1); 2726 // dst = floor(src + 0.5, ties to even) 2727 } 2728 bind(none); 2729 2730 sve_fcvtzs(dst, T, ptrue, dst, T); 2731 // result in dst 2732 } 2733 2734 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2735 FloatRegister one, SIMD_Arrangement T) { 2736 assert_different_registers(dst, src, zero, one); 2737 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2738 2739 facgt(dst, T, src, zero); 2740 ushr(dst, T, 
dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise 2741 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst 2742 } 2743 2744 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero, 2745 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) { 2746 assert_different_registers(dst, src, zero, one, vtmp); 2747 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2748 2749 sve_orr(vtmp, src, src); 2750 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise 2751 switch (T) { 2752 case S: 2753 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src 2754 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending 2755 // on the sign of the float value 2756 break; 2757 case D: 2758 sve_and(vtmp, T, min_jlong); 2759 sve_orr(vtmp, T, jlong_cast(1.0)); 2760 break; 2761 default: 2762 assert(false, "unsupported"); 2763 ShouldNotReachHere(); 2764 } 2765 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp 2766 // Result in dst 2767 } 2768 2769 bool C2_MacroAssembler::in_scratch_emit_size() { 2770 if (ciEnv::current()->task() != nullptr) { 2771 PhaseOutput* phase_output = Compile::current()->output(); 2772 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2773 return true; 2774 } 2775 } 2776 return MacroAssembler::in_scratch_emit_size(); 2777 } 2778 2779 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) { 2780 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi); 2781 } 2782 2783 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) { 2784 assert(!t->empty() && !t->singleton(), "%s", Type::str(t)); 2785 if (t == TypeInt::INT) { 2786 return; 2787 } 2788 BLOCK_COMMENT("verify_int_in_range {"); 2789 Label L_success, L_failure; 2790 2791 jint lo = t->_lo; 2792 jint hi = t->_hi; 2793 2794 if (lo != min_jint && hi != max_jint) { 2795 subsw(rtmp, rval, lo); 2796 br(Assembler::LT, L_failure); 2797 subsw(rtmp, rval, hi); 2798 br(Assembler::LE, L_success); 2799 } else if (lo != min_jint) { 2800 subsw(rtmp, rval, lo); 2801 br(Assembler::GE, L_success); 2802 } else if (hi != max_jint) { 2803 subsw(rtmp, rval, hi); 2804 br(Assembler::LE, L_success); 2805 } else { 2806 ShouldNotReachHere(); 2807 } 2808 2809 bind(L_failure); 2810 movw(c_rarg0, idx); 2811 mov(c_rarg1, rval); 2812 movw(c_rarg2, lo); 2813 movw(c_rarg3, hi); 2814 reconstruct_frame_pointer(rtmp); 2815 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp); 2816 hlt(0); 2817 2818 bind(L_success); 2819 BLOCK_COMMENT("} verify_int_in_range"); 2820 } 2821 2822 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) { 2823 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi); 2824 } 2825 2826 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) { 2827 assert(!t->empty() && !t->singleton(), "%s", Type::str(t)); 2828 if (t == TypeLong::LONG) { 2829 return; 2830 } 2831 BLOCK_COMMENT("verify_long_in_range {"); 2832 Label L_success, L_failure; 2833 2834 jlong lo = t->_lo; 2835 jlong hi = t->_hi; 2836 2837 if (lo != min_jlong && hi != max_jlong) { 2838 subs(rtmp, rval, lo); 2839 br(Assembler::LT, L_failure); 2840 subs(rtmp, rval, hi); 
2841 br(Assembler::LE, L_success); 2842 } else if (lo != min_jlong) { 2843 subs(rtmp, rval, lo); 2844 br(Assembler::GE, L_success); 2845 } else if (hi != max_jlong) { 2846 subs(rtmp, rval, hi); 2847 br(Assembler::LE, L_success); 2848 } else { 2849 ShouldNotReachHere(); 2850 } 2851 2852 bind(L_failure); 2853 movw(c_rarg0, idx); 2854 mov(c_rarg1, rval); 2855 mov(c_rarg2, lo); 2856 mov(c_rarg3, hi); 2857 reconstruct_frame_pointer(rtmp); 2858 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp); 2859 hlt(0); 2860 2861 bind(L_success); 2862 BLOCK_COMMENT("} verify_long_in_range"); 2863 } 2864 2865 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 2866 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 2867 if (PreserveFramePointer) { 2868 // frame pointer is valid 2869 #ifdef ASSERT 2870 // Verify frame pointer value in rfp. 2871 add(rtmp, sp, framesize - 2 * wordSize); 2872 Label L_success; 2873 cmp(rfp, rtmp); 2874 br(Assembler::EQ, L_success); 2875 stop("frame pointer mismatch"); 2876 bind(L_success); 2877 #endif // ASSERT 2878 } else { 2879 add(rfp, sp, framesize - 2 * wordSize); 2880 } 2881 }