/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}
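
// Overview (informal): jdk.internal.util.ArraysSupport.vectorizedHashCode computes the usual
// polynomial array hash, roughly
//
//   int h = result;                        // incoming initial value
//   for (int i = 0; i < cnt; i++) {
//     h = 31 * h + (int)ary[i];            // element widened according to eltype
//   }
//   return h;
//
// The intrinsic below handles short arrays with the partially unrolled scalar loop (note
// movw(tmp2, 0x1f) and maddw: result = result * 31 + element) and defers large arrays to the
// large_arrays_hashcode stub.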

// jdk.internal.util.ArraysSupport.vectorizedHashCode
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the
  // stubs. We use 8H load arrangements for chars and shorts and 8B for booleans and bytes.
  // It's possible to use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT ? 4
                    : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns, i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
  movw(tmp2, 0x1f);
  br(tmp1);

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1);
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}
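
// Overview (informal sketch, not exact): fast_lock emits the LM_LEGACY stack-locking fast path.
// In pseudocode:
//
//   mark = obj->mark;
//   if (mark has the monitor bit set)                   goto inflated-monitor path;
//   box->displaced_header = mark | unlocked_value;
//   if (CAS(&obj->mark, mark | unlocked_value, box))    locked;                        // thin lock
//   else if (obj->mark points into our own stack)       box->displaced_header = 0;     // recursive
//   else                                                slow path;
//
// Success/failure is reported to C2 through the condition flags (EQ = success, NE = failure).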

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr, rscratch2);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition is true we continue at cont and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // Try to CAS owner (no owner => current thread's _monitor_owner_id).
  ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rscratch2);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  if (LockingMode == LM_LEGACY) {
    inc_held_monitor_count(rscratch1);
  }

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty (EntryList first - by convention).
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, tmpReg);
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont); // If so we are done.

  // Check if there is a successor.
  ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
  cmp(rscratch1, zr);
  br(Assembler::NE, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

  cmp(zr, rthread); // Set Flag to NE => slow path
  b(cont);

  bind(unlocked);
  cmp(zr, zr); // Set Flag to EQ => fast path

  // Intentional fall-through

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  if (LockingMode == LM_LEGACY) {
    dec_held_monitor_count(rscratch1);
  }

  bind(no_count);
}
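
// Overview (informal sketch, not exact): fast_lock_lightweight emits the LM_LIGHTWEIGHT fast
// path, which records fast locks on a small per-thread lock stack instead of in the object's
// markWord. Roughly:
//
//   if (lock stack is full)                      slow path;
//   if (top of lock stack == obj)                push obj;                  // recursive
//   mark = obj->mark;
//   if (mark has the monitor bit set)            goto inflated-monitor path;
//   if (CAS lock bits 0b01 -> 0b00 succeeds)     push obj; locked;
//   else                                         slow path;
//
// The inflated path looks the monitor up (via the per-thread OMCache when UseObjectMonitorTable
// is enabled) and CASes the owner field, or bumps the recursion count for a re-entrant lock.
// Flags: EQ = success, NE = failure.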

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST be branched to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST be branched to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST be branched to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
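
// Overview (informal sketch, not exact): fast_unlock_lightweight is the LM_LIGHTWEIGHT
// counterpart of the above. Roughly:
//
//   if (top of lock stack != obj)                goto inflated path (mark must be a monitor);
//   pop obj from the lock stack;
//   if (next lock-stack entry == obj)            done;                      // recursive
//   if (CAS lock bits 0b00 -> 0b01 succeeds)     done;
//   else                                         restore lock stack; slow path;
//
// The inflated path either decrements the recursion count or releases the owner field and,
// after a StoreLoad fence, checks the entry lists and successor to decide whether the runtime
// has to be entered. Flags: EQ = success, NE = failure.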

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST be branched to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST be branched to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty (EntryList first - by convention).
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked); // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer-Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few Java-specific optimizations.
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //       int i, j;
  //       unsigned c;
  //       unsigned char bc[ASIZE];
  //
  //       /* Preprocessing */
  //       for (i = 0; i < ASIZE; ++i)
  //          bc[i] = m;
  //       for (i = 0; i < m - 1; ) {
  //          c = x[i];
  //          ++i;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef PATTERN_STRING_IS_LATIN1
  //          bc[c] = m - i;
  //          #else
  //          if (c < ASIZE) bc[c] = m - i;
  //          #endif
  //       }
  //
  //       /* Searching */
  //       j = 0;
  //       while (j <= n - m) {
  //          c = y[i+j];
  //          if (x[m-1] == c)
  //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //          if (i < 0) return j;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef SOURCE_STRING_IS_LATIN1
  //          // LL case: (c< 256) always true. Remove branch
  //          j += bc[y[j+m-1]];
  //          #endif
  //          #ifndef PATTERN_STRING_IS_UTF
  //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += 1
  //          #endif
  //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += m
  //          #endif
  //       }
  //    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
    stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
    subs(tmp5, tmp5, 1);
    br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
    (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
    if (!str1_isL) {
      subs(zr, ch1, ASIZE);
      br(HS, BCSKIP);
    }
    strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
    subs(ch2, ch2, 1);
    br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8);  // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
    (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
    if (str1_isL == str2_isL) {
      // re-init tmp3. It's for free because it's executed in parallel with
      // load above. Alternative is to initialize it before loop, but it'll
      // affect performance on in-order systems with 2 or more ld/st pipelines
      lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
    }
    if (!isL) { // UU/UL case
      lsl(ch2, cnt1tmp, 1); // offset in bytes
    }
    cmp(tmp3, skipch);
    br(NE, BMSKIP);
    ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
    mov(ch1, tmp6);
    if (isL) {
      b(BMLOOPSTR1_AFTER_LOAD);
    } else {
      sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
      b(BMLOOPSTR1_CMP);
    }
    BIND(BMLOOPSTR1);
    (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
    (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
    subs(cnt1tmp, cnt1tmp, 1);
    br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
    cmp(ch1, ch2);
    br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
    if (!isL) {
      // if we've met a UTF symbol while searching a Latin1 pattern, then we can
      // skip cnt1 symbols
      if (str1_isL != str2_isL) {
        mov(result_tmp, cnt1);
      } else {
        mov(result_tmp, 1);
      }
      subs(zr, skipch, ASIZE);
      br(HS, BMADV);
    }
    ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
    sub(cnt1tmp, cnt1, 1);
    add(str2, str2, result_tmp, LSL, str2_chr_shift);
    cmp(str2, str2end);
    br(LE, BMLOOPSTR2);
    add(sp, sp, ASIZE);
    b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
    cmp(ch1, ch2);
    br(NE, BMSKIP);
    BIND(BMMATCH);
    sub(result, str2, tmp5);
    if (!str2_isL) lsr(result, result, 1);
    add(sp, sp, ASIZE);
    b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

      BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

      BIND(HAS_ZERO);
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

      BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
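
// Note (informal): string_indexof_char below (like the DO1 case above) relies on a standard
// SWAR trick to test a whole register of characters per iteration. After xor-ing the loaded
// word with the broadcast search char, a lane is zero exactly where the char matches, and
// roughly
//
//   tmp1 = (word - 0x0101010101010101) & ~(word | 0x7f7f7f7f7f7f7f7f)
//
// (0x0001.../0x7fff... for the 16-bit char variant) sets the top bit of the lowest lane that
// was zero; higher lanes may see borrow effects, but only the first match is needed. rev + clz
// then convert that bit position into the byte offset of the match.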

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
  // Read a vector of 8- or 16-bit data depending on the string type. Note
  // that inactive elements indicated by the predicate register won't cause
  // a data read from memory to the destination vector.
  if (isL) {
    sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
  } else {
    sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
  }
  add(idx, idx, vec_len);

  // Perform the comparison. An element of the destination predicate is set
  // to active if the particular char is matched.
  sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

  // Branch if the particular char is found.
  br(NE, MATCH);

  sve_whilelt(tmp_pg, T, idx, cnt1);

  // Loop back if the particular char is not found.
  br(MI, LOOP);

  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);

  BIND(MATCH);
  // Undo the index increment.
  sub(idx, idx, vec_len);

  // Crop the vector to find its location.
  sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
  add(result, idx, -1);
  sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
  ldrb(ch1, Address(str1, cnt1_neg));
  cmp(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 1);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
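
// Note (informal): in the mixed-encoding cases below (LU/UL), the Latin1 operand is widened
// to UTF-16 on the fly by zero-extending each byte: its bytes are interleaved with a zeroed
// vector via zip1 (vtmpZ holds zero), and the resulting doubleword is moved back to a general
// register with fmovd so that both operands can be compared as UTF-16 data.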

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
  case StrIntrinsicNode::LL:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
    break;
  case StrIntrinsicNode::UU:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
    break;
  case StrIntrinsicNode::LU:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
    break;
  case StrIntrinsicNode::UL:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
    break;
  default:
    ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
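
// Note (informal): neon_compare below rewrites LT/LE/LO/LS by swapping the operands so that
// only the GT/GE/HI/HS/EQ vector compare forms are needed, and implements NE as EQ followed by
// a bitwise NOT of the result vector; neon_compare_zero handles NE against zero the same way.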

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
1680 umov(dst, vtmp1, B, 0);
1681 } else if (lane_cnt <= 16) {
1682 ins(vtmp1, B, vtmp1, 1, 8);
1683 umov(dst, vtmp1, H, 0);
1684 } else {
1685 // As the lane count is 64 at most, the final expected value must be in
1686 // the lowest 64 bits after narrowing vtmp1 from D to B.
1687 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1688 umov(dst, vtmp1, D, 0);
1689 }
1690 } else if (UseSVE > 0) {
1691 // Compress the lowest 8 bytes.
1692 fmovd(dst, vtmp1);
1693 bytemask_compress(dst);
1694 if (lane_cnt <= 8) return;
1695
1696 // Repeat on higher bytes and join the results.
1697 // Compress 8 bytes in each iteration.
1698 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1699 sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1700 bytemask_compress(rscratch1);
1701 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1702 }
1703 } else {
1704 assert(false, "unsupported");
1705 ShouldNotReachHere();
1706 }
1707 }
1708
1709 // Unpack the mask, a long value in src, into predicate register dst based on the
1710 // corresponding data type. Note that dst can support at most 64 lanes.
1711 // Below example gives the expected dst predicate register in different types, with
1712 // a valid src (0x658D) on a 1024-bit vector size machine.
1713 // BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1714 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1715 // INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1716 // LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1717 //
1718 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D, which
1719 // has 24 significant bits, would be an invalid input if the dst predicate register refers to
1720 // a LONG type 1024-bit vector, which has at most 16 lanes.
1721 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1722 FloatRegister vtmp1, FloatRegister vtmp2) {
1723 assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1724 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1725 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1726 // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1727 // Expected: dst = 0b01100101 10001101
1728
1729 // Put long value from general purpose register into the first lane of vector.
1730 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1731 sve_dup(vtmp1, B, 0);
1732 mov(vtmp1, D, 0, src);
1733
1734 // As sve_cmp generates the mask with byte granularity at minimum, the bit
1735 // mask held in the first lane has to be expanded into a byte mask first,
1736 // which can be done with SVE2's BDEP instruction.
1737
1738 // The first source input of BDEP instruction. Deposit each mask byte into its own 8-byte lane.
1739 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1740 if (lane_cnt <= 8) {
1741 // Nothing to do, as only one byte exists.
1742 } else if (lane_cnt <= 16) {
1743 ins(vtmp1, B, vtmp1, 8, 1);
1744 mov(vtmp1, B, 1, zr);
1745 } else {
1746 sve_vector_extend(vtmp1, D, vtmp1, B);
1747 }
1748
1749 // The second source input of BDEP instruction, initialized with 0x01 for each byte.
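// With 0x01 in every byte, BDEP deposits the low 8 bits of each 64-bit lane of vtmp1 into bit 0 of the corresponding bytes.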
1750 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1751 sve_dup(vtmp2, B, 1); 1752 1753 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1754 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1755 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1756 // --------------------------------------- 1757 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1758 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1759 1760 if (bt != T_BYTE) { 1761 sve_vector_extend(vtmp1, size, vtmp1, B); 1762 } 1763 // Generate mask according to the given vector, in which the elements have been 1764 // extended to expected type. 1765 // dst = 0b01101001 10001101 1766 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1767 } 1768 1769 // Clobbers: rflags 1770 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1771 FloatRegister zn, FloatRegister zm, Condition cond) { 1772 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1773 FloatRegister z1 = zn, z2 = zm; 1774 switch (cond) { 1775 case LE: z1 = zm; z2 = zn; cond = GE; break; 1776 case LT: z1 = zm; z2 = zn; cond = GT; break; 1777 case LO: z1 = zm; z2 = zn; cond = HI; break; 1778 case LS: z1 = zm; z2 = zn; cond = HS; break; 1779 default: 1780 break; 1781 } 1782 1783 SIMD_RegVariant size = elemType_to_regVariant(bt); 1784 if (is_floating_point_type(bt)) { 1785 sve_fcm(cond, pd, size, pg, z1, z2); 1786 } else { 1787 assert(is_integral_type(bt), "unsupported element type"); 1788 sve_cmp(cond, pd, size, pg, z1, z2); 1789 } 1790 } 1791 1792 // Get index of the last mask lane that is set 1793 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1794 SIMD_RegVariant size = elemType_to_regVariant(bt); 1795 sve_rev(ptmp, size, src); 1796 sve_brkb(ptmp, ptrue, ptmp, false); 1797 sve_cntp(dst, size, ptrue, ptmp); 1798 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1799 subw(dst, rscratch1, dst); 1800 } 1801 1802 // Extend integer vector src to dst with the same lane count 1803 // but larger element size, e.g. 4B -> 4I 1804 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1805 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1806 if (src_bt == T_BYTE) { 1807 if (dst_bt == T_SHORT) { 1808 // 4B/8B to 4S/8S 1809 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1810 } else { 1811 // 4B to 4I 1812 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1813 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1814 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1815 } 1816 } else if (src_bt == T_SHORT) { 1817 // 4S to 4I 1818 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1819 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1820 } else if (src_bt == T_INT) { 1821 // 2I to 2L 1822 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1823 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1824 } else { 1825 ShouldNotReachHere(); 1826 } 1827 } 1828 1829 // Narrow integer vector src down to dst with the same lane count 1830 // but smaller element size, e.g. 
4I -> 4B 1831 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1832 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1833 if (src_bt == T_SHORT) { 1834 // 4S/8S to 4B/8B 1835 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1836 assert(dst_bt == T_BYTE, "unsupported"); 1837 xtn(dst, T8B, src, T8H); 1838 } else if (src_bt == T_INT) { 1839 // 4I to 4B/4S 1840 assert(src_vlen_in_bytes == 16, "unsupported"); 1841 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1842 xtn(dst, T4H, src, T4S); 1843 if (dst_bt == T_BYTE) { 1844 xtn(dst, T8B, dst, T8H); 1845 } 1846 } else if (src_bt == T_LONG) { 1847 // 2L to 2I 1848 assert(src_vlen_in_bytes == 16, "unsupported"); 1849 assert(dst_bt == T_INT, "unsupported"); 1850 xtn(dst, T2S, src, T2D); 1851 } else { 1852 ShouldNotReachHere(); 1853 } 1854 } 1855 1856 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1857 FloatRegister src, SIMD_RegVariant src_size, 1858 bool is_unsigned) { 1859 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1860 1861 if (src_size == B) { 1862 switch (dst_size) { 1863 case H: 1864 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1865 break; 1866 case S: 1867 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1868 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1869 break; 1870 case D: 1871 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1872 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1873 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1874 break; 1875 default: 1876 ShouldNotReachHere(); 1877 } 1878 } else if (src_size == H) { 1879 if (dst_size == S) { 1880 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1881 } else { // D 1882 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1883 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1884 } 1885 } else if (src_size == S) { 1886 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1887 } 1888 } 1889 1890 // Vector narrow from src to dst with specified element sizes. 1891 // High part of dst vector will be filled with zero. 1892 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1893 FloatRegister src, SIMD_RegVariant src_size, 1894 FloatRegister tmp) { 1895 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1896 assert_different_registers(src, tmp); 1897 sve_dup(tmp, src_size, 0); 1898 if (src_size == D) { 1899 switch (dst_size) { 1900 case S: 1901 sve_uzp1(dst, S, src, tmp); 1902 break; 1903 case H: 1904 assert_different_registers(dst, tmp); 1905 sve_uzp1(dst, S, src, tmp); 1906 sve_uzp1(dst, H, dst, tmp); 1907 break; 1908 case B: 1909 assert_different_registers(dst, tmp); 1910 sve_uzp1(dst, S, src, tmp); 1911 sve_uzp1(dst, H, dst, tmp); 1912 sve_uzp1(dst, B, dst, tmp); 1913 break; 1914 default: 1915 ShouldNotReachHere(); 1916 } 1917 } else if (src_size == S) { 1918 if (dst_size == H) { 1919 sve_uzp1(dst, H, src, tmp); 1920 } else { // B 1921 assert_different_registers(dst, tmp); 1922 sve_uzp1(dst, H, src, tmp); 1923 sve_uzp1(dst, B, dst, tmp); 1924 } 1925 } else if (src_size == H) { 1926 sve_uzp1(dst, B, src, tmp); 1927 } 1928 } 1929 1930 // Extend src predicate to dst predicate with the same lane count but larger 1931 // element size, e.g. 
64Byte -> 512Long 1932 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1933 uint dst_element_length_in_bytes, 1934 uint src_element_length_in_bytes) { 1935 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1936 sve_punpklo(dst, src); 1937 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1938 sve_punpklo(dst, src); 1939 sve_punpklo(dst, dst); 1940 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1941 sve_punpklo(dst, src); 1942 sve_punpklo(dst, dst); 1943 sve_punpklo(dst, dst); 1944 } else { 1945 assert(false, "unsupported"); 1946 ShouldNotReachHere(); 1947 } 1948 } 1949 1950 // Narrow src predicate to dst predicate with the same lane count but 1951 // smaller element size, e.g. 512Long -> 64Byte 1952 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1953 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1954 // The insignificant bits in src predicate are expected to be zero. 1955 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1956 // passed as the second argument. An example narrowing operation with a given mask would be - 1957 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1958 // Mask (for 2 Longs) : TF 1959 // Predicate register for the above mask (16 bits) : 00000001 00000000 1960 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1961 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1962 assert_different_registers(src, ptmp); 1963 assert_different_registers(dst, ptmp); 1964 sve_pfalse(ptmp); 1965 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1966 sve_uzp1(dst, B, src, ptmp); 1967 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1968 sve_uzp1(dst, H, src, ptmp); 1969 sve_uzp1(dst, B, dst, ptmp); 1970 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1971 sve_uzp1(dst, S, src, ptmp); 1972 sve_uzp1(dst, H, dst, ptmp); 1973 sve_uzp1(dst, B, dst, ptmp); 1974 } else { 1975 assert(false, "unsupported"); 1976 ShouldNotReachHere(); 1977 } 1978 } 1979 1980 // Vector reduction add for integral type with ASIMD instructions. 1981 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1982 Register isrc, FloatRegister vsrc, 1983 unsigned vector_length_in_bytes, 1984 FloatRegister vtmp) { 1985 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1986 assert_different_registers(dst, isrc); 1987 bool isQ = vector_length_in_bytes == 16; 1988 1989 BLOCK_COMMENT("neon_reduce_add_integral {"); 1990 switch(bt) { 1991 case T_BYTE: 1992 addv(vtmp, isQ ? T16B : T8B, vsrc); 1993 smov(dst, vtmp, B, 0); 1994 addw(dst, dst, isrc, ext::sxtb); 1995 break; 1996 case T_SHORT: 1997 addv(vtmp, isQ ? T8H : T4H, vsrc); 1998 smov(dst, vtmp, H, 0); 1999 addw(dst, dst, isrc, ext::sxth); 2000 break; 2001 case T_INT: 2002 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 2003 umov(dst, vtmp, S, 0); 2004 addw(dst, dst, isrc); 2005 break; 2006 case T_LONG: 2007 assert(isQ, "unsupported"); 2008 addpd(vtmp, vsrc); 2009 umov(dst, vtmp, D, 0); 2010 add(dst, dst, isrc); 2011 break; 2012 default: 2013 assert(false, "unsupported"); 2014 ShouldNotReachHere(); 2015 } 2016 BLOCK_COMMENT("} neon_reduce_add_integral"); 2017 } 2018 2019 // Vector reduction multiply for integral type with ASIMD instructions. 
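// The vector is repeatedly folded in half, multiplying the upper half into the lower half,
// until only two lanes remain; these are then multiplied into the scalar input isrc.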
2020 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 2021 // Clobbers: rscratch1 2022 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 2023 Register isrc, FloatRegister vsrc, 2024 unsigned vector_length_in_bytes, 2025 FloatRegister vtmp1, FloatRegister vtmp2) { 2026 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2027 bool isQ = vector_length_in_bytes == 16; 2028 2029 BLOCK_COMMENT("neon_reduce_mul_integral {"); 2030 switch(bt) { 2031 case T_BYTE: 2032 if (isQ) { 2033 // Multiply the lower half and higher half of vector iteratively. 2034 // vtmp1 = vsrc[8:15] 2035 ins(vtmp1, D, vsrc, 0, 1); 2036 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 2037 mulv(vtmp1, T8B, vtmp1, vsrc); 2038 // vtmp2 = vtmp1[4:7] 2039 ins(vtmp2, S, vtmp1, 0, 1); 2040 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 2041 mulv(vtmp1, T8B, vtmp2, vtmp1); 2042 } else { 2043 ins(vtmp1, S, vsrc, 0, 1); 2044 mulv(vtmp1, T8B, vtmp1, vsrc); 2045 } 2046 // vtmp2 = vtmp1[2:3] 2047 ins(vtmp2, H, vtmp1, 0, 1); 2048 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 2049 mulv(vtmp2, T8B, vtmp2, vtmp1); 2050 // dst = vtmp2[0] * isrc * vtmp2[1] 2051 umov(rscratch1, vtmp2, B, 0); 2052 mulw(dst, rscratch1, isrc); 2053 sxtb(dst, dst); 2054 umov(rscratch1, vtmp2, B, 1); 2055 mulw(dst, rscratch1, dst); 2056 sxtb(dst, dst); 2057 break; 2058 case T_SHORT: 2059 if (isQ) { 2060 ins(vtmp2, D, vsrc, 0, 1); 2061 mulv(vtmp2, T4H, vtmp2, vsrc); 2062 ins(vtmp1, S, vtmp2, 0, 1); 2063 mulv(vtmp1, T4H, vtmp1, vtmp2); 2064 } else { 2065 ins(vtmp1, S, vsrc, 0, 1); 2066 mulv(vtmp1, T4H, vtmp1, vsrc); 2067 } 2068 umov(rscratch1, vtmp1, H, 0); 2069 mulw(dst, rscratch1, isrc); 2070 sxth(dst, dst); 2071 umov(rscratch1, vtmp1, H, 1); 2072 mulw(dst, rscratch1, dst); 2073 sxth(dst, dst); 2074 break; 2075 case T_INT: 2076 if (isQ) { 2077 ins(vtmp1, D, vsrc, 0, 1); 2078 mulv(vtmp1, T2S, vtmp1, vsrc); 2079 } else { 2080 vtmp1 = vsrc; 2081 } 2082 umov(rscratch1, vtmp1, S, 0); 2083 mul(dst, rscratch1, isrc); 2084 umov(rscratch1, vtmp1, S, 1); 2085 mul(dst, rscratch1, dst); 2086 break; 2087 case T_LONG: 2088 umov(rscratch1, vsrc, D, 0); 2089 mul(dst, isrc, rscratch1); 2090 umov(rscratch1, vsrc, D, 1); 2091 mul(dst, dst, rscratch1); 2092 break; 2093 default: 2094 assert(false, "unsupported"); 2095 ShouldNotReachHere(); 2096 } 2097 BLOCK_COMMENT("} neon_reduce_mul_integral"); 2098 } 2099 2100 // Vector reduction multiply for floating-point type with ASIMD instructions. 
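// Floating-point multiplication is not associative, so the lanes are multiplied into the
// scalar strictly in lane order, keeping the result identical to a sequential reduction.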
2101 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 2102 FloatRegister fsrc, FloatRegister vsrc, 2103 unsigned vector_length_in_bytes, 2104 FloatRegister vtmp) { 2105 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2106 bool isQ = vector_length_in_bytes == 16; 2107 2108 BLOCK_COMMENT("neon_reduce_mul_fp {"); 2109 switch(bt) { 2110 case T_FLOAT: 2111 fmuls(dst, fsrc, vsrc); 2112 ins(vtmp, S, vsrc, 0, 1); 2113 fmuls(dst, dst, vtmp); 2114 if (isQ) { 2115 ins(vtmp, S, vsrc, 0, 2); 2116 fmuls(dst, dst, vtmp); 2117 ins(vtmp, S, vsrc, 0, 3); 2118 fmuls(dst, dst, vtmp); 2119 } 2120 break; 2121 case T_DOUBLE: 2122 assert(isQ, "unsupported"); 2123 fmuld(dst, fsrc, vsrc); 2124 ins(vtmp, D, vsrc, 0, 1); 2125 fmuld(dst, dst, vtmp); 2126 break; 2127 default: 2128 assert(false, "unsupported"); 2129 ShouldNotReachHere(); 2130 } 2131 BLOCK_COMMENT("} neon_reduce_mul_fp"); 2132 } 2133 2134 // Helper to select logical instruction 2135 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 2136 Register Rn, Register Rm, 2137 enum shift_kind kind, unsigned shift) { 2138 switch(opc) { 2139 case Op_AndReductionV: 2140 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 2141 break; 2142 case Op_OrReductionV: 2143 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 2144 break; 2145 case Op_XorReductionV: 2146 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 2147 break; 2148 default: 2149 assert(false, "unsupported"); 2150 ShouldNotReachHere(); 2151 } 2152 } 2153 2154 // Vector reduction logical operations And, Or, Xor 2155 // Clobbers: rscratch1 2156 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2157 Register isrc, FloatRegister vsrc, 2158 unsigned vector_length_in_bytes) { 2159 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2160 "unsupported"); 2161 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2162 assert_different_registers(dst, isrc); 2163 bool isQ = vector_length_in_bytes == 16; 2164 2165 BLOCK_COMMENT("neon_reduce_logical {"); 2166 umov(rscratch1, vsrc, isQ ? D : S, 0); 2167 umov(dst, vsrc, isQ ? 
D : S, 1); 2168 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2169 switch(bt) { 2170 case T_BYTE: 2171 if (isQ) { 2172 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2173 } 2174 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2175 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2176 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2177 sxtb(dst, dst); 2178 break; 2179 case T_SHORT: 2180 if (isQ) { 2181 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2182 } 2183 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2184 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2185 sxth(dst, dst); 2186 break; 2187 case T_INT: 2188 if (isQ) { 2189 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2190 } 2191 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2192 break; 2193 case T_LONG: 2194 assert(isQ, "unsupported"); 2195 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2196 break; 2197 default: 2198 assert(false, "unsupported"); 2199 ShouldNotReachHere(); 2200 } 2201 BLOCK_COMMENT("} neon_reduce_logical"); 2202 } 2203 2204 // Vector reduction min/max for integral type with ASIMD instructions. 2205 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2206 // Clobbers: rscratch1, rflags 2207 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2208 Register isrc, FloatRegister vsrc, 2209 unsigned vector_length_in_bytes, 2210 FloatRegister vtmp) { 2211 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2212 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2213 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2214 assert_different_registers(dst, isrc); 2215 bool isQ = vector_length_in_bytes == 16; 2216 bool is_min = opc == Op_MinReductionV; 2217 2218 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2219 if (bt == T_LONG) { 2220 assert(vtmp == fnoreg, "should be"); 2221 assert(isQ, "should be"); 2222 umov(rscratch1, vsrc, D, 0); 2223 cmp(isrc, rscratch1); 2224 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2225 umov(rscratch1, vsrc, D, 1); 2226 cmp(dst, rscratch1); 2227 csel(dst, dst, rscratch1, is_min ? LT : GT); 2228 } else { 2229 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2230 if (size == T2S) { 2231 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2232 } else { 2233 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2234 } 2235 if (bt == T_INT) { 2236 umov(dst, vtmp, S, 0); 2237 } else { 2238 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2239 } 2240 cmpw(dst, isrc); 2241 cselw(dst, dst, isrc, is_min ? LT : GT); 2242 } 2243 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2244 } 2245 2246 // Vector reduction for integral type with SVE instruction. 2247 // Supported operations are Add, And, Or, Xor, Max, Min. 2248 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
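// The vector lanes are reduced with a single predicated SVE reduction instruction, then the
// scalar input src1 is folded into the extracted result, with sign extension for sub-word types.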
2249 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2250 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2251 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2252 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2253 assert_different_registers(src1, dst); 2254 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2255 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2256 switch (opc) { 2257 case Op_AddReductionVI: { 2258 sve_uaddv(tmp, size, pg, src2); 2259 if (bt == T_BYTE) { 2260 smov(dst, tmp, size, 0); 2261 addw(dst, src1, dst, ext::sxtb); 2262 } else if (bt == T_SHORT) { 2263 smov(dst, tmp, size, 0); 2264 addw(dst, src1, dst, ext::sxth); 2265 } else { 2266 umov(dst, tmp, size, 0); 2267 addw(dst, dst, src1); 2268 } 2269 break; 2270 } 2271 case Op_AddReductionVL: { 2272 sve_uaddv(tmp, size, pg, src2); 2273 umov(dst, tmp, size, 0); 2274 add(dst, dst, src1); 2275 break; 2276 } 2277 case Op_AndReductionV: { 2278 sve_andv(tmp, size, pg, src2); 2279 if (bt == T_INT || bt == T_LONG) { 2280 umov(dst, tmp, size, 0); 2281 } else { 2282 smov(dst, tmp, size, 0); 2283 } 2284 if (bt == T_LONG) { 2285 andr(dst, dst, src1); 2286 } else { 2287 andw(dst, dst, src1); 2288 } 2289 break; 2290 } 2291 case Op_OrReductionV: { 2292 sve_orv(tmp, size, pg, src2); 2293 if (bt == T_INT || bt == T_LONG) { 2294 umov(dst, tmp, size, 0); 2295 } else { 2296 smov(dst, tmp, size, 0); 2297 } 2298 if (bt == T_LONG) { 2299 orr(dst, dst, src1); 2300 } else { 2301 orrw(dst, dst, src1); 2302 } 2303 break; 2304 } 2305 case Op_XorReductionV: { 2306 sve_eorv(tmp, size, pg, src2); 2307 if (bt == T_INT || bt == T_LONG) { 2308 umov(dst, tmp, size, 0); 2309 } else { 2310 smov(dst, tmp, size, 0); 2311 } 2312 if (bt == T_LONG) { 2313 eor(dst, dst, src1); 2314 } else { 2315 eorw(dst, dst, src1); 2316 } 2317 break; 2318 } 2319 case Op_MaxReductionV: { 2320 sve_smaxv(tmp, size, pg, src2); 2321 if (bt == T_INT || bt == T_LONG) { 2322 umov(dst, tmp, size, 0); 2323 } else { 2324 smov(dst, tmp, size, 0); 2325 } 2326 if (bt == T_LONG) { 2327 cmp(dst, src1); 2328 csel(dst, dst, src1, Assembler::GT); 2329 } else { 2330 cmpw(dst, src1); 2331 cselw(dst, dst, src1, Assembler::GT); 2332 } 2333 break; 2334 } 2335 case Op_MinReductionV: { 2336 sve_sminv(tmp, size, pg, src2); 2337 if (bt == T_INT || bt == T_LONG) { 2338 umov(dst, tmp, size, 0); 2339 } else { 2340 smov(dst, tmp, size, 0); 2341 } 2342 if (bt == T_LONG) { 2343 cmp(dst, src1); 2344 csel(dst, dst, src1, Assembler::LT); 2345 } else { 2346 cmpw(dst, src1); 2347 cselw(dst, dst, src1, Assembler::LT); 2348 } 2349 break; 2350 } 2351 default: 2352 assert(false, "unsupported"); 2353 ShouldNotReachHere(); 2354 } 2355 2356 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2357 if (bt == T_BYTE) { 2358 sxtb(dst, dst); 2359 } else if (bt == T_SHORT) { 2360 sxth(dst, dst); 2361 } 2362 } 2363 } 2364 2365 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2366 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2367 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
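// A single "sve_ptrue" with a fixed pattern (VL1-VL8, VL16/32/64/128/256, POW2, MUL4 or MUL3)
// is used when lane_cnt matches one of them; otherwise the mask is generated with "sve_whileltw".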
2368 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2369 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2370 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2371 2372 // Set all elements to false if the input "lane_cnt" is zero. 2373 if (lane_cnt == 0) { 2374 sve_pfalse(dst); 2375 return; 2376 } 2377 2378 SIMD_RegVariant size = elemType_to_regVariant(bt); 2379 assert(size != Q, "invalid size"); 2380 2381 // Set all true if "lane_cnt" equals to the max lane count. 2382 if (lane_cnt == max_vector_length) { 2383 sve_ptrue(dst, size, /* ALL */ 0b11111); 2384 return; 2385 } 2386 2387 // Fixed numbers for "ptrue". 2388 switch(lane_cnt) { 2389 case 1: /* VL1 */ 2390 case 2: /* VL2 */ 2391 case 3: /* VL3 */ 2392 case 4: /* VL4 */ 2393 case 5: /* VL5 */ 2394 case 6: /* VL6 */ 2395 case 7: /* VL7 */ 2396 case 8: /* VL8 */ 2397 sve_ptrue(dst, size, lane_cnt); 2398 return; 2399 case 16: 2400 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2401 return; 2402 case 32: 2403 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2404 return; 2405 case 64: 2406 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2407 return; 2408 case 128: 2409 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2410 return; 2411 case 256: 2412 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2413 return; 2414 default: 2415 break; 2416 } 2417 2418 // Special patterns for "ptrue". 2419 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2420 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2421 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2422 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2423 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2424 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2425 } else { 2426 // Encode to "whileltw" for the remaining cases. 2427 mov(rscratch1, lane_cnt); 2428 sve_whileltw(dst, size, zr, rscratch1); 2429 } 2430 } 2431 2432 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2433 // Any remaining elements of dst will be filled with zero. 2434 // Clobbers: rscratch1 2435 // Preserves: src, mask 2436 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2437 FloatRegister vtmp1, FloatRegister vtmp2, 2438 PRegister pgtmp) { 2439 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2440 assert_different_registers(dst, src, vtmp1, vtmp2); 2441 assert_different_registers(mask, pgtmp); 2442 2443 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 2444 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 2445 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 2446 sve_dup(vtmp2, H, 0); 2447 2448 // Extend lowest half to type INT. 2449 // dst = 00004444 00003333 00002222 00001111 2450 sve_uunpklo(dst, S, src); 2451 // pgtmp = 00000001 00000000 00000001 00000001 2452 sve_punpklo(pgtmp, mask); 2453 // Pack the active elements in size of type INT to the right, 2454 // and fill the remainings with zero. 2455 // dst = 00000000 00004444 00002222 00001111 2456 sve_compact(dst, S, dst, pgtmp); 2457 // Narrow the result back to type SHORT. 2458 // dst = 0000 0000 0000 0000 0000 4444 2222 1111 2459 sve_uzp1(dst, H, dst, vtmp2); 2460 // Count the active elements of lowest half. 2461 // rscratch1 = 3 2462 sve_cntp(rscratch1, S, ptrue, pgtmp); 2463 2464 // Repeat to the highest half. 
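// The upper half of src goes through the same unpack, compact and narrow sequence.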
2465 // pgtmp = 00000001 00000000 00000000 00000001
2466 sve_punpkhi(pgtmp, mask);
2467 // vtmp1 = 00008888 00007777 00006666 00005555
2468 sve_uunpkhi(vtmp1, S, src);
2469 // vtmp1 = 00000000 00000000 00008888 00005555
2470 sve_compact(vtmp1, S, vtmp1, pgtmp);
2471 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2472 sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2473
2474 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
2475 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2476 // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2477 // TRUE_CNT is the number of active elements in the compressed low.
2478 neg(rscratch1, rscratch1);
2479 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2480 sve_index(vtmp2, H, rscratch1, 1);
2481 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2482 sve_tbl(vtmp1, H, vtmp1, vtmp2);
2483
2484 // Combine the compressed high part (after the shift) with the compressed low part.
2485 // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2486 sve_orr(dst, dst, vtmp1);
2487 }
2488
2489 // Clobbers: rscratch1, rscratch2
2490 // Preserves: src, mask
2491 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2492 FloatRegister vtmp1, FloatRegister vtmp2,
2493 FloatRegister vtmp3, FloatRegister vtmp4,
2494 PRegister ptmp, PRegister pgtmp) {
2495 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2496 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2497 assert_different_registers(mask, ptmp, pgtmp);
2498 // Example input: src = 88 77 66 55 44 33 22 11
2499 // mask = 01 00 00 01 01 00 01 01
2500 // Expected result: dst = 00 00 00 88 55 44 22 11
2501
2502 sve_dup(vtmp4, B, 0);
2503 // Extend lowest half to type SHORT.
2504 // vtmp1 = 0044 0033 0022 0011
2505 sve_uunpklo(vtmp1, H, src);
2506 // ptmp = 0001 0000 0001 0001
2507 sve_punpklo(ptmp, mask);
2508 // Count the active elements of lowest half.
2509 // rscratch2 = 3
2510 sve_cntp(rscratch2, H, ptrue, ptmp);
2511 // Pack the active elements in size of type SHORT to the right,
2512 // and fill the remaining lanes with zero.
2513 // dst = 0000 0044 0022 0011
2514 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2515 // Narrow the result back to type BYTE.
2516 // dst = 00 00 00 00 00 44 22 11
2517 sve_uzp1(dst, B, dst, vtmp4);
2518
2519 // Repeat to the highest half.
2520 // ptmp = 0001 0000 0000 0001
2521 sve_punpkhi(ptmp, mask);
2522 // vtmp2 = 0088 0077 0066 0055
2523 sve_uunpkhi(vtmp2, H, src);
2524 // vtmp1 = 0000 0000 0088 0055
2525 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2526
2527 sve_dup(vtmp4, B, 0);
2528 // vtmp1 = 00 00 00 00 00 00 88 55
2529 sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2530
2531 // Compressed low: dst = 00 00 00 00 00 44 22 11
2532 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
2533 // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2534 // TRUE_CNT is the number of active elements in the compressed low.
2535 neg(rscratch2, rscratch2);
2536 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2537 sve_index(vtmp2, B, rscratch2, 1);
2538 // vtmp1 = 00 00 00 88 55 00 00 00
2539 sve_tbl(vtmp1, B, vtmp1, vtmp2);
2540 // Combine the compressed high part (after the shift) with the compressed low part.
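// ORing is safe because the shifted high part is zero in its lowest TRUE_CNT lanes,
// exactly where the compressed low part holds its data.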
2541 // dst = 00 00 00 88 55 44 22 11 2542 sve_orr(dst, dst, vtmp1); 2543 } 2544 2545 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2546 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2547 SIMD_Arrangement size = isQ ? T16B : T8B; 2548 if (bt == T_BYTE) { 2549 rbit(dst, size, src); 2550 } else { 2551 neon_reverse_bytes(dst, src, bt, isQ); 2552 rbit(dst, size, dst); 2553 } 2554 } 2555 2556 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2557 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2558 SIMD_Arrangement size = isQ ? T16B : T8B; 2559 switch (bt) { 2560 case T_BYTE: 2561 if (dst != src) { 2562 orr(dst, size, src, src); 2563 } 2564 break; 2565 case T_SHORT: 2566 rev16(dst, size, src); 2567 break; 2568 case T_INT: 2569 rev32(dst, size, src); 2570 break; 2571 case T_LONG: 2572 rev64(dst, size, src); 2573 break; 2574 default: 2575 assert(false, "unsupported"); 2576 ShouldNotReachHere(); 2577 } 2578 } 2579 2580 // Extract a scalar element from an sve vector at position 'idx'. 2581 // The input elements in src are expected to be of integral type. 2582 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2583 int idx, FloatRegister vtmp) { 2584 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2585 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2586 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2587 if (bt == T_INT || bt == T_LONG) { 2588 umov(dst, src, size, idx); 2589 } else { 2590 smov(dst, src, size, idx); 2591 } 2592 } else { 2593 sve_orr(vtmp, src, src); 2594 sve_ext(vtmp, vtmp, idx << size); 2595 if (bt == T_INT || bt == T_LONG) { 2596 umov(dst, vtmp, size, 0); 2597 } else { 2598 smov(dst, vtmp, size, 0); 2599 } 2600 } 2601 } 2602 2603 // java.lang.Math::round intrinsics 2604 2605 // Clobbers: rscratch1, rflags 2606 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2607 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2608 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2609 switch (T) { 2610 case T2S: 2611 case T4S: 2612 fmovs(tmp1, T, 0.5f); 2613 mov(rscratch1, jint_cast(0x1.0p23f)); 2614 break; 2615 case T2D: 2616 fmovd(tmp1, T, 0.5); 2617 mov(rscratch1, julong_cast(0x1.0p52)); 2618 break; 2619 default: 2620 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2621 } 2622 fadd(tmp1, T, tmp1, src); 2623 fcvtms(tmp1, T, tmp1); 2624 // tmp1 = floor(src + 0.5, ties to even) 2625 2626 fcvtas(dst, T, src); 2627 // dst = round(src), ties to away 2628 2629 fneg(tmp3, T, src); 2630 dup(tmp2, T, rscratch1); 2631 cm(HS, tmp3, T, tmp3, tmp2); 2632 // tmp3 is now a set of flags 2633 2634 bif(dst, T16B, tmp1, tmp3); 2635 // result in dst 2636 } 2637 2638 // Clobbers: rscratch1, rflags 2639 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2640 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2641 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2642 assert_different_registers(tmp1, tmp2, src, dst); 2643 2644 switch (T) { 2645 case S: 2646 mov(rscratch1, jint_cast(0x1.0p23f)); 2647 break; 2648 case D: 2649 mov(rscratch1, julong_cast(0x1.0p52)); 2650 break; 2651 
default:
2652 assert(T == S || T == D, "invalid register variant");
2653 }
2654
2655 sve_frinta(dst, T, ptrue, src);
2656 // dst = round(src), ties to away
2657
2658 Label none;
2659
2660 sve_fneg(tmp1, T, ptrue, src);
2661 sve_dup(tmp2, T, rscratch1);
2662 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2663 br(EQ, none);
2664 {
2665 sve_cpy(tmp1, T, pgtmp, 0.5);
2666 sve_fadd(tmp1, T, pgtmp, src);
2667 sve_frintm(dst, T, pgtmp, tmp1);
2668 // dst = floor(src + 0.5, ties to even)
2669 }
2670 bind(none);
2671
2672 sve_fcvtzs(dst, T, ptrue, dst, T);
2673 // result in dst
2674 }
2675
2676 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2677 FloatRegister one, SIMD_Arrangement T) {
2678 assert_different_registers(dst, src, zero, one);
2679 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2680
2681 facgt(dst, T, src, zero);
2682 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2683 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2684 }
2685
2686 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2687 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2688 assert_different_registers(dst, src, zero, one, vtmp);
2689 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2690
2691 sve_orr(vtmp, src, src);
2692 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN, 0x1 otherwise
2693 switch (T) {
2694 case S:
2695 sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
2696 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2697 // on the sign of the float value
2698 break;
2699 case D:
2700 sve_and(vtmp, T, min_jlong);
2701 sve_orr(vtmp, T, jlong_cast(1.0));
2702 break;
2703 default:
2704 assert(false, "unsupported");
2705 ShouldNotReachHere();
2706 }
2707 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2708 // Result in dst
2709 }
2710
2711 bool C2_MacroAssembler::in_scratch_emit_size() {
2712 if (ciEnv::current()->task() != nullptr) {
2713 PhaseOutput* phase_output = Compile::current()->output();
2714 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2715 return true;
2716 }
2717 }
2718 return MacroAssembler::in_scratch_emit_size();
2719 }