1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "opto/c2_MacroAssembler.hpp" 28 #include "opto/compile.hpp" 29 #include "opto/intrinsicnode.hpp" 30 #include "opto/matcher.hpp" 31 #include "opto/output.hpp" 32 #include "opto/subnode.hpp" 33 #include "runtime/stubRoutines.hpp" 34 #include "utilities/globalDefinitions.hpp" 35 #include "utilities/powerOfTwo.hpp" 36 37 #ifdef PRODUCT 38 #define BLOCK_COMMENT(str) /* nothing */ 39 #define STOP(error) stop(error) 40 #else 41 #define BLOCK_COMMENT(str) block_comment(str) 42 #define STOP(error) block_comment(error); stop(error) 43 #endif 44 45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 46 47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 48 49 // jdk.internal.util.ArraysSupport.vectorizedHashCode 50 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 51 FloatRegister vdata0, FloatRegister vdata1, 52 FloatRegister vdata2, FloatRegister vdata3, 53 FloatRegister vmul0, FloatRegister vmul1, 54 FloatRegister vmul2, FloatRegister vmul3, 55 FloatRegister vpow, FloatRegister vpowm, 56 BasicType eltype) { 57 ARRAYS_HASHCODE_REGISTERS; 58 59 Register tmp1 = rscratch1, tmp2 = rscratch2; 60 61 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE; 62 63 // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We 64 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to 65 // use 4H for chars and shorts instead, but using 8H gives better performance. 66 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8 67 : eltype == T_CHAR || eltype == T_SHORT ? 8 68 : eltype == T_INT ? 4 69 : 0; 70 guarantee(vf, "unsupported eltype"); 71 72 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis. 
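  //
  // For reference, a scalar sketch of what this intrinsic computes -- the same polynomial
  // hash as the Java loop behind jdk.internal.util.ArraysSupport.vectorizedHashCode
  // (illustrative only, not the emitted code; value_of() stands for the per-eltype
  // widening of an element, unsigned for T_BOOLEAN/T_CHAR, signed for T_BYTE/T_SHORT):
  //
  //   int hash(int result, T ary[], int cnt) {
  //     for (int i = 0; i < cnt; i++) {
  //       result = 31 * result + value_of(ary[i]);
  //     }
  //     return result;
  //   }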
73 const size_t unroll_factor = 4; 74 75 switch (eltype) { 76 case T_BOOLEAN: 77 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); 78 break; 79 case T_CHAR: 80 BLOCK_COMMENT("arrays_hashcode(char) {"); 81 break; 82 case T_BYTE: 83 BLOCK_COMMENT("arrays_hashcode(byte) {"); 84 break; 85 case T_SHORT: 86 BLOCK_COMMENT("arrays_hashcode(short) {"); 87 break; 88 case T_INT: 89 BLOCK_COMMENT("arrays_hashcode(int) {"); 90 break; 91 default: 92 ShouldNotReachHere(); 93 } 94 95 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop 96 // implemented by the stub executes just once. Call the stub only if at least two iterations will 97 // be executed. 98 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf; 99 cmpw(cnt, large_threshold); 100 br(Assembler::HS, LARGE); 101 102 bind(TAIL); 103 104 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past 105 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs. 106 // Iteration eats up the remainder, uf elements at a time. 107 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC"); 108 andr(tmp2, cnt, unroll_factor - 1); 109 adr(tmp1, BR_BASE); 110 sub(tmp1, tmp1, tmp2, ext::sxtw, 3); 111 movw(tmp2, 0x1f); 112 br(tmp1); 113 114 bind(LOOP); 115 for (size_t i = 0; i < unroll_factor; ++i) { 116 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype); 117 maddw(result, result, tmp2, tmp1); 118 } 119 bind(BR_BASE); 120 subsw(cnt, cnt, unroll_factor); 121 br(Assembler::HS, LOOP); 122 123 b(DONE); 124 125 bind(LARGE); 126 127 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype)); 128 assert(stub.target() != nullptr, "array_hashcode stub has not been generated"); 129 address tpc = trampoline_call(stub); 130 if (tpc == nullptr) { 131 DEBUG_ONLY(reset_labels(TAIL, BR_BASE)); 132 postcond(pc() == badAddress); 133 return nullptr; 134 } 135 136 bind(DONE); 137 138 BLOCK_COMMENT("} // arrays_hashcode"); 139 140 postcond(pc() != badAddress); 141 return pc(); 142 } 143 144 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 145 Register tmp2Reg, Register tmp3Reg) { 146 Register oop = objectReg; 147 Register box = boxReg; 148 Register disp_hdr = tmpReg; 149 Register tmp = tmp2Reg; 150 Label cont; 151 Label object_has_monitor; 152 Label count, no_count; 153 154 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 155 assert_different_registers(oop, box, tmp, disp_hdr, rscratch2); 156 157 // Load markWord from object into displaced_header. 158 ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 159 160 if (DiagnoseSyncOnValueBasedClasses != 0) { 161 load_klass(tmp, oop); 162 ldrb(tmp, Address(tmp, Klass::misc_flags_offset())); 163 tst(tmp, KlassFlags::_misc_is_value_based_class); 164 br(Assembler::NE, cont); 165 } 166 167 // Check for existing monitor 168 tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor); 169 170 if (LockingMode == LM_MONITOR) { 171 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 172 b(cont); 173 } else { 174 assert(LockingMode == LM_LEGACY, "must be"); 175 // Set tmp to be (markWord of object | UNLOCK_VALUE). 176 orr(tmp, disp_hdr, markWord::unlocked_value); 177 178 // Initialize the box. (Must happen before we update the object mark!) 
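    // Rough pseudo-code for the LM_LEGACY fast-lock attempt below (illustrative only,
    // not the emitted code):
    //
    //   box->displaced_header = mark | unlocked_value;              // the str below
    //   if (CAS(&obj->mark, mark | unlocked_value, box)) goto cont; // locked, flags == EQ
    //   // CAS failed: if the mark points into our own stack this is a recursive lock,
    //   // so store 0 as the displaced header; otherwise fall through with flags == NE.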
179 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 180 181 // Compare object markWord with an unlocked value (tmp) and if 182 // equal exchange the stack address of our box with object markWord. 183 // On failure disp_hdr contains the possibly locked markWord. 184 cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true, 185 /*release*/ true, /*weak*/ false, disp_hdr); 186 br(Assembler::EQ, cont); 187 188 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 189 190 // If the compare-and-exchange succeeded, then we found an unlocked 191 // object, will have now locked it will continue at label cont 192 193 // Check if the owner is self by comparing the value in the 194 // markWord of object (disp_hdr) with the stack pointer. 195 mov(rscratch1, sp); 196 sub(disp_hdr, disp_hdr, rscratch1); 197 mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place)); 198 // If condition is true we are cont and hence we can store 0 as the 199 // displaced header in the box, which indicates that it is a recursive lock. 200 ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result 201 str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes())); 202 b(cont); 203 } 204 205 // Handle existing monitor. 206 bind(object_has_monitor); 207 208 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 209 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset())); 210 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value)); 211 cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true, 212 /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result 213 214 // Store a non-null value into the box to avoid looking like a re-entrant 215 // lock. The fast-path monitor unlock code checks for 216 // markWord::monitor_value so use markWord::unused_mark which has the 217 // relevant bit set, and also matches ObjectSynchronizer::enter. 218 mov(tmp, (address)markWord::unused_mark().value()); 219 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 220 221 br(Assembler::EQ, cont); // CAS success means locking succeeded 222 223 cmp(tmp3Reg, rscratch2); 224 br(Assembler::NE, cont); // Check for recursive locking 225 226 // Recursive lock case 227 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1); 228 // flag == EQ still from the cmp above, checking if this is a reentrant lock 229 230 bind(cont); 231 // flag == EQ indicates success 232 // flag == NE indicates failure 233 br(Assembler::NE, no_count); 234 235 bind(count); 236 if (LockingMode == LM_LEGACY) { 237 inc_held_monitor_count(rscratch1); 238 } 239 240 bind(no_count); 241 } 242 243 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg, 244 Register tmp2Reg) { 245 Register oop = objectReg; 246 Register box = boxReg; 247 Register disp_hdr = tmpReg; 248 Register owner_addr = tmpReg; 249 Register tmp = tmp2Reg; 250 Label cont; 251 Label object_has_monitor; 252 Label count, no_count; 253 Label unlocked; 254 255 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 256 assert_different_registers(oop, box, tmp, disp_hdr); 257 258 if (LockingMode == LM_LEGACY) { 259 // Find the lock address and load the displaced header from the stack. 260 ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes())); 261 262 // If the displaced header is 0, we have a recursive unlock. 
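    // Rough pseudo-code for the LM_LEGACY fast-unlock path (illustrative only, not the
    // emitted code):
    //
    //   if (box->displaced_header == 0) goto cont;           // recursive unlock, flags == EQ
    //   if (mark_is_monitor(obj))       goto object_has_monitor;
    //   CAS(&obj->mark, box, box->displaced_header);         // restore the displaced mark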
263 cmp(disp_hdr, zr); 264 br(Assembler::EQ, cont); 265 } 266 267 // Handle existing monitor. 268 ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes())); 269 tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor); 270 271 if (LockingMode == LM_MONITOR) { 272 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 273 b(cont); 274 } else { 275 assert(LockingMode == LM_LEGACY, "must be"); 276 // Check if it is still a light weight lock, this is is true if we 277 // see the stack address of the basicLock in the markWord of the 278 // object. 279 280 cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false, 281 /*release*/ true, /*weak*/ false, tmp); 282 b(cont); 283 } 284 285 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 286 287 // Handle existing monitor. 288 bind(object_has_monitor); 289 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 290 add(tmp, tmp, -(int)markWord::monitor_value); // monitor 291 292 ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 293 294 Label notRecursive; 295 cbz(disp_hdr, notRecursive); 296 297 // Recursive lock 298 sub(disp_hdr, disp_hdr, 1u); 299 str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 300 cmp(disp_hdr, disp_hdr); // Sets flags for result 301 b(cont); 302 303 bind(notRecursive); 304 305 // Compute owner address. 306 lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset())); 307 308 // Set owner to null. 309 // Release to satisfy the JMM 310 stlr(zr, owner_addr); 311 // We need a full fence after clearing owner to avoid stranding. 312 // StoreLoad achieves this. 313 membar(StoreLoad); 314 315 // Check if the entry_list is empty. 316 ldr(rscratch1, Address(tmp, ObjectMonitor::entry_list_offset())); 317 cmp(rscratch1, zr); 318 br(Assembler::EQ, cont); // If so we are done. 319 320 // Check if there is a successor. 321 ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset())); 322 cmp(rscratch1, zr); 323 br(Assembler::NE, unlocked); // If so we are done. 324 325 // Save the monitor pointer in the current thread, so we can try to 326 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 327 str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 328 329 cmp(zr, rthread); // Set Flag to NE => slow path 330 b(cont); 331 332 bind(unlocked); 333 cmp(zr, zr); // Set Flag to EQ => fast path 334 335 // Intentional fall-through 336 337 bind(cont); 338 // flag == EQ indicates success 339 // flag == NE indicates failure 340 br(Assembler::NE, no_count); 341 342 bind(count); 343 if (LockingMode == LM_LEGACY) { 344 dec_held_monitor_count(rscratch1); 345 } 346 347 bind(no_count); 348 } 349 350 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1, 351 Register t2, Register t3) { 352 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 353 assert_different_registers(obj, box, t1, t2, t3, rscratch2); 354 355 // Handle inflated monitor. 356 Label inflated; 357 // Finish fast lock successfully. MUST branch to with flag == EQ 358 Label locked; 359 // Finish fast lock unsuccessfully. MUST branch to with flag == NE 360 Label slow_path; 361 362 if (UseObjectMonitorTable) { 363 // Clear cache in case fast locking succeeds. 
364 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 365 } 366 367 if (DiagnoseSyncOnValueBasedClasses != 0) { 368 load_klass(t1, obj); 369 ldrb(t1, Address(t1, Klass::misc_flags_offset())); 370 tst(t1, KlassFlags::_misc_is_value_based_class); 371 br(Assembler::NE, slow_path); 372 } 373 374 const Register t1_mark = t1; 375 const Register t3_t = t3; 376 377 { // Lightweight locking 378 379 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ 380 Label push; 381 382 const Register t2_top = t2; 383 384 // Check if lock-stack is full. 385 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 386 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1); 387 br(Assembler::GT, slow_path); 388 389 // Check if recursive. 390 subw(t3_t, t2_top, oopSize); 391 ldr(t3_t, Address(rthread, t3_t)); 392 cmp(obj, t3_t); 393 br(Assembler::EQ, push); 394 395 // Relaxed normal load to check for monitor. Optimization for monitor case. 396 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 397 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 398 399 // Not inflated 400 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 401 402 // Try to lock. Transition lock-bits 0b01 => 0b00 403 orr(t1_mark, t1_mark, markWord::unlocked_value); 404 eor(t3_t, t1_mark, markWord::unlocked_value); 405 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 406 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 407 br(Assembler::NE, slow_path); 408 409 bind(push); 410 // After successful lock, push object on lock-stack. 411 str(obj, Address(rthread, t2_top)); 412 addw(t2_top, t2_top, oopSize); 413 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 414 b(locked); 415 } 416 417 { // Handle inflated monitor. 418 bind(inflated); 419 420 const Register t1_monitor = t1; 421 422 if (!UseObjectMonitorTable) { 423 assert(t1_monitor == t1_mark, "should be the same here"); 424 } else { 425 Label monitor_found; 426 427 // Load cache address 428 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset())); 429 430 const int num_unrolled = 2; 431 for (int i = 0; i < num_unrolled; i++) { 432 ldr(t1, Address(t3_t)); 433 cmp(obj, t1); 434 br(Assembler::EQ, monitor_found); 435 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 436 } 437 438 Label loop; 439 440 // Search for obj in cache. 441 bind(loop); 442 443 // Check for match. 444 ldr(t1, Address(t3_t)); 445 cmp(obj, t1); 446 br(Assembler::EQ, monitor_found); 447 448 // Search until null encountered, guaranteed _null_sentinel at end. 449 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 450 cbnz(t1, loop); 451 // Cache Miss, NE set from cmp above, cbnz does not set flags 452 b(slow_path); 453 454 bind(monitor_found); 455 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference())); 456 } 457 458 const Register t2_owner_addr = t2; 459 const Register t3_owner = t3; 460 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 461 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag); 462 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag); 463 464 Label monitor_locked; 465 466 // Compute owner address. 467 lea(t2_owner_addr, owner_address); 468 469 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 
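    // Roughly, the CAS below does the following (illustrative only, not the emitted code):
    //
    //   tid = current_thread->_monitor_owner_id;
    //   if (CAS(&monitor->_owner, null, tid)) goto monitor_locked;        // we now own it
    //   if (owner == tid) { monitor->_recursions++; goto monitor_locked; } // re-entrant
    //   goto slow_path;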
470 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset())); 471 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true, 472 /*release*/ false, /*weak*/ false, t3_owner); 473 br(Assembler::EQ, monitor_locked); 474 475 // Check if recursive. 476 cmp(t3_owner, rscratch2); 477 br(Assembler::NE, slow_path); 478 479 // Recursive. 480 increment(recursions_address, 1); 481 482 bind(monitor_locked); 483 if (UseObjectMonitorTable) { 484 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 485 } 486 } 487 488 bind(locked); 489 490 #ifdef ASSERT 491 // Check that locked label is reached with Flags == EQ. 492 Label flag_correct; 493 br(Assembler::EQ, flag_correct); 494 stop("Fast Lock Flag != EQ"); 495 #endif 496 497 bind(slow_path); 498 #ifdef ASSERT 499 // Check that slow_path label is reached with Flags == NE. 500 br(Assembler::NE, flag_correct); 501 stop("Fast Lock Flag != NE"); 502 bind(flag_correct); 503 #endif 504 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 505 } 506 507 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1, 508 Register t2, Register t3) { 509 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 510 assert_different_registers(obj, box, t1, t2, t3); 511 512 // Handle inflated monitor. 513 Label inflated, inflated_load_mark; 514 // Finish fast unlock successfully. MUST branch to with flag == EQ 515 Label unlocked; 516 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 517 Label slow_path; 518 519 const Register t1_mark = t1; 520 const Register t2_top = t2; 521 const Register t3_t = t3; 522 523 { // Lightweight unlock 524 525 Label push_and_slow_path; 526 527 // Check if obj is top of lock-stack. 528 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 529 subw(t2_top, t2_top, oopSize); 530 ldr(t3_t, Address(rthread, t2_top)); 531 cmp(obj, t3_t); 532 // Top of lock stack was not obj. Must be monitor. 533 br(Assembler::NE, inflated_load_mark); 534 535 // Pop lock-stack. 536 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 537 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 538 539 // Check if recursive. 540 subw(t3_t, t2_top, oopSize); 541 ldr(t3_t, Address(rthread, t3_t)); 542 cmp(obj, t3_t); 543 br(Assembler::EQ, unlocked); 544 545 // Not recursive. 546 // Load Mark. 547 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 548 549 // Check header for monitor (0b10). 550 // Because we got here by popping (meaning we pushed in locked) 551 // there will be no monitor in the box. So we need to push back the obj 552 // so that the runtime can fix any potential anonymous owner. 553 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated); 554 555 // Try to unlock. Transition lock bits 0b00 => 0b01 556 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 557 orr(t3_t, t1_mark, markWord::unlocked_value); 558 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 559 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 560 br(Assembler::EQ, unlocked); 561 562 bind(push_and_slow_path); 563 // Compare and exchange failed. 564 // Restore lock-stack and handle the unlock in runtime. 565 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 566 addw(t2_top, t2_top, oopSize); 567 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 568 b(slow_path); 569 } 570 571 572 { // Handle inflated monitor. 
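    // Sketch of the inflated-monitor unlock implemented below (illustrative only):
    //
    //   if (monitor->_recursions != 0) { monitor->_recursions--; flags = EQ; }
    //   else {
    //     monitor->_owner = null;                      // releasing store + StoreLoad fence
    //     if (entry_list == null || successor != null) flags = EQ;        // unlocked, done
    //     else { thread->_unlocked_inflated_monitor = monitor; flags = NE; } // runtime re-acquires
    //   }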
573 bind(inflated_load_mark); 574 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 575 #ifdef ASSERT 576 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 577 stop("Fast Unlock not monitor"); 578 #endif 579 580 bind(inflated); 581 582 #ifdef ASSERT 583 Label check_done; 584 subw(t2_top, t2_top, oopSize); 585 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 586 br(Assembler::LT, check_done); 587 ldr(t3_t, Address(rthread, t2_top)); 588 cmp(obj, t3_t); 589 br(Assembler::NE, inflated); 590 stop("Fast Unlock lock on stack"); 591 bind(check_done); 592 #endif 593 594 const Register t1_monitor = t1; 595 596 if (!UseObjectMonitorTable) { 597 assert(t1_monitor == t1_mark, "should be the same here"); 598 599 // Untag the monitor. 600 add(t1_monitor, t1_mark, -(int)markWord::monitor_value); 601 } else { 602 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 603 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 604 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 605 br(Assembler::LO, slow_path); 606 } 607 608 const Register t2_recursions = t2; 609 Label not_recursive; 610 611 // Check if recursive. 612 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 613 cbz(t2_recursions, not_recursive); 614 615 // Recursive unlock. 616 sub(t2_recursions, t2_recursions, 1u); 617 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 618 // Set flag == EQ 619 cmp(t2_recursions, t2_recursions); 620 b(unlocked); 621 622 bind(not_recursive); 623 624 const Register t2_owner_addr = t2; 625 626 // Compute owner address. 627 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 628 629 // Set owner to null. 630 // Release to satisfy the JMM 631 stlr(zr, t2_owner_addr); 632 // We need a full fence after clearing owner to avoid stranding. 633 // StoreLoad achieves this. 634 membar(StoreLoad); 635 636 // Check if the entry_list is empty. 637 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset())); 638 cmp(rscratch1, zr); 639 br(Assembler::EQ, unlocked); // If so we are done. 640 641 // Check if there is a successor. 642 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset())); 643 cmp(rscratch1, zr); 644 br(Assembler::NE, unlocked); // If so we are done. 645 646 // Save the monitor pointer in the current thread, so we can try to 647 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 648 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 649 650 cmp(zr, rthread); // Set Flag to NE => slow path 651 b(slow_path); 652 } 653 654 bind(unlocked); 655 cmp(zr, zr); // Set Flags to EQ => fast path 656 657 #ifdef ASSERT 658 // Check that unlocked label is reached with Flags == EQ. 659 Label flag_correct; 660 br(Assembler::EQ, flag_correct); 661 stop("Fast Unlock Flag != EQ"); 662 #endif 663 664 bind(slow_path); 665 #ifdef ASSERT 666 // Check that slow_path label is reached with Flags == NE. 667 br(Assembler::NE, flag_correct); 668 stop("Fast Unlock Flag != NE"); 669 bind(flag_correct); 670 #endif 671 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 672 } 673 674 // Search for str1 in str2 and return index or -1 675 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 
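// Overview (informal): str1/cnt1 is the pattern and str2/cnt2 the source. icnt1 is the
// pattern length when it is a compile-time constant (the 1..4 element cases are expanded
// inline below); icnt1 == -1 means the length is only known at runtime, and the code then
// chooses between a linear scan, a stub call, and the Boyer-Moore(-Horspool) loop based on
// the pattern and source lengths.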
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with two shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few Java-specific optimizations.
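  //
  // Worked example of the bad-character table bc[] built by the preprocessing loop in the
  // reference code below (illustrative): for the Latin1 pattern "abcab" (m == 5), every
  // entry defaults to 5 and the loop leaves bc['a'] = 1, bc['b'] = 3, bc['c'] = 2, i.e.
  // the distance from a character's last occurrence (the final position excluded) to the
  // end of the pattern.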
747 // 748 // #define ASIZE 256 749 // 750 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 751 // int i, j; 752 // unsigned c; 753 // unsigned char bc[ASIZE]; 754 // 755 // /* Preprocessing */ 756 // for (i = 0; i < ASIZE; ++i) 757 // bc[i] = m; 758 // for (i = 0; i < m - 1; ) { 759 // c = x[i]; 760 // ++i; 761 // // c < 256 for Latin1 string, so, no need for branch 762 // #ifdef PATTERN_STRING_IS_LATIN1 763 // bc[c] = m - i; 764 // #else 765 // if (c < ASIZE) bc[c] = m - i; 766 // #endif 767 // } 768 // 769 // /* Searching */ 770 // j = 0; 771 // while (j <= n - m) { 772 // c = y[i+j]; 773 // if (x[m-1] == c) 774 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 775 // if (i < 0) return j; 776 // // c < 256 for Latin1 string, so, no need for branch 777 // #ifdef SOURCE_STRING_IS_LATIN1 778 // // LL case: (c< 256) always true. Remove branch 779 // j += bc[y[j+m-1]]; 780 // #endif 781 // #ifndef PATTERN_STRING_IS_UTF 782 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 783 // if (c < ASIZE) 784 // j += bc[y[j+m-1]]; 785 // else 786 // j += 1 787 // #endif 788 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 789 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 790 // if (c < ASIZE) 791 // j += bc[y[j+m-1]]; 792 // else 793 // j += m 794 // #endif 795 // } 796 // } 797 798 if (icnt1 == -1) { 799 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 800 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 801 Register cnt1end = tmp2; 802 Register str2end = cnt2; 803 Register skipch = tmp2; 804 805 // str1 length is >=8, so, we can read at least 1 register for cases when 806 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 807 // UL case. We'll re-read last character in inner pre-loop code to have 808 // single outer pre-loop load 809 const int firstStep = isL ? 7 : 3; 810 811 const int ASIZE = 256; 812 const int STORED_BYTES = 32; // amount of bytes stored per instruction 813 sub(sp, sp, ASIZE); 814 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 815 mov(ch1, sp); 816 BIND(BM_INIT_LOOP); 817 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 818 subs(tmp5, tmp5, 1); 819 br(GT, BM_INIT_LOOP); 820 821 sub(cnt1tmp, cnt1, 1); 822 mov(tmp5, str2); 823 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 824 sub(ch2, cnt1, 1); 825 mov(tmp3, str1); 826 BIND(BCLOOP); 827 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 828 if (!str1_isL) { 829 subs(zr, ch1, ASIZE); 830 br(HS, BCSKIP); 831 } 832 strb(ch2, Address(sp, ch1)); 833 BIND(BCSKIP); 834 subs(ch2, ch2, 1); 835 br(GT, BCLOOP); 836 837 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 838 if (str1_isL == str2_isL) { 839 // load last 8 bytes (8LL/4UU symbols) 840 ldr(tmp6, Address(tmp6, -wordSize)); 841 } else { 842 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 843 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 844 // it's still faster than per-character loads+checks 845 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 846 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 847 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 848 andr(tmp6, tmp6, 0xFF); // str1[N-4] 849 orr(ch2, ch1, ch2, LSL, 16); 850 orr(tmp6, tmp6, tmp3, LSL, 48); 851 orr(tmp6, tmp6, ch2, LSL, 16); 852 } 853 BIND(BMLOOPSTR2); 854 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 855 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 856 if (str1_isL == str2_isL) { 857 // re-init tmp3. It's for free because it's executed in parallel with 858 // load above. Alternative is to initialize it before loop, but it'll 859 // affect performance on in-order systems with 2 or more ld/st pipelines 860 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 861 } 862 if (!isL) { // UU/UL case 863 lsl(ch2, cnt1tmp, 1); // offset in bytes 864 } 865 cmp(tmp3, skipch); 866 br(NE, BMSKIP); 867 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 868 mov(ch1, tmp6); 869 if (isL) { 870 b(BMLOOPSTR1_AFTER_LOAD); 871 } else { 872 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 873 b(BMLOOPSTR1_CMP); 874 } 875 BIND(BMLOOPSTR1); 876 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 877 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 878 BIND(BMLOOPSTR1_AFTER_LOAD); 879 subs(cnt1tmp, cnt1tmp, 1); 880 br(LT, BMLOOPSTR1_LASTCMP); 881 BIND(BMLOOPSTR1_CMP); 882 cmp(ch1, ch2); 883 br(EQ, BMLOOPSTR1); 884 BIND(BMSKIP); 885 if (!isL) { 886 // if we've met UTF symbol while searching Latin1 pattern, then we can 887 // skip cnt1 symbols 888 if (str1_isL != str2_isL) { 889 mov(result_tmp, cnt1); 890 } else { 891 mov(result_tmp, 1); 892 } 893 subs(zr, skipch, ASIZE); 894 br(HS, BMADV); 895 } 896 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 897 BIND(BMADV); 898 sub(cnt1tmp, cnt1, 1); 899 add(str2, str2, result_tmp, LSL, str2_chr_shift); 900 cmp(str2, str2end); 901 br(LE, BMLOOPSTR2); 902 add(sp, sp, ASIZE); 903 b(NOMATCH); 904 BIND(BMLOOPSTR1_LASTCMP); 905 cmp(ch1, ch2); 906 br(NE, BMSKIP); 907 BIND(BMMATCH); 908 sub(result, str2, tmp5); 909 if (!str2_isL) lsr(result, result, 1); 910 add(sp, sp, ASIZE); 911 b(DONE); 912 913 BIND(LINEARSTUB); 914 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 915 br(LT, LINEAR_MEDIUM); 916 mov(result, zr); 917 RuntimeAddress stub = nullptr; 918 if (isL) { 919 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 920 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 921 } else if (str1_isL) { 922 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 923 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 924 } else { 925 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 926 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 927 } 928 address call = trampoline_call(stub); 929 if (call == nullptr) { 930 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 931 ciEnv::current()->record_failure("CodeCache is full"); 932 return; 933 } 934 b(DONE); 935 } 936 937 BIND(LINEARSEARCH); 938 { 939 Label DO1, DO2, DO3; 940 941 Register str2tmp = tmp2; 942 Register first = tmp3; 943 944 if (icnt1 == 
-1) 945 { 946 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 947 948 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 949 br(LT, DOSHORT); 950 BIND(LINEAR_MEDIUM); 951 (this->*str1_load_1chr)(first, Address(str1)); 952 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 953 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 954 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 955 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 956 957 BIND(FIRST_LOOP); 958 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 959 cmp(first, ch2); 960 br(EQ, STR1_LOOP); 961 BIND(STR2_NEXT); 962 adds(cnt2_neg, cnt2_neg, str2_chr_size); 963 br(LE, FIRST_LOOP); 964 b(NOMATCH); 965 966 BIND(STR1_LOOP); 967 adds(cnt1tmp, cnt1_neg, str1_chr_size); 968 add(cnt2tmp, cnt2_neg, str2_chr_size); 969 br(GE, MATCH); 970 971 BIND(STR1_NEXT); 972 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 973 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 974 cmp(ch1, ch2); 975 br(NE, STR2_NEXT); 976 adds(cnt1tmp, cnt1tmp, str1_chr_size); 977 add(cnt2tmp, cnt2tmp, str2_chr_size); 978 br(LT, STR1_NEXT); 979 b(MATCH); 980 981 BIND(DOSHORT); 982 if (str1_isL == str2_isL) { 983 cmp(cnt1, (u1)2); 984 br(LT, DO1); 985 br(GT, DO3); 986 } 987 } 988 989 if (icnt1 == 4) { 990 Label CH1_LOOP; 991 992 (this->*load_4chr)(ch1, str1); 993 sub(result_tmp, cnt2, 4); 994 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 995 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 996 997 BIND(CH1_LOOP); 998 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 999 cmp(ch1, ch2); 1000 br(EQ, MATCH); 1001 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1002 br(LE, CH1_LOOP); 1003 b(NOMATCH); 1004 } 1005 1006 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 1007 Label CH1_LOOP; 1008 1009 BIND(DO2); 1010 (this->*load_2chr)(ch1, str1); 1011 if (icnt1 == 2) { 1012 sub(result_tmp, cnt2, 2); 1013 } 1014 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1015 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1016 BIND(CH1_LOOP); 1017 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1018 cmp(ch1, ch2); 1019 br(EQ, MATCH); 1020 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1021 br(LE, CH1_LOOP); 1022 b(NOMATCH); 1023 } 1024 1025 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 1026 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 1027 1028 BIND(DO3); 1029 (this->*load_2chr)(first, str1); 1030 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 1031 if (icnt1 == 3) { 1032 sub(result_tmp, cnt2, 3); 1033 } 1034 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1035 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1036 BIND(FIRST_LOOP); 1037 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1038 cmpw(first, ch2); 1039 br(EQ, STR1_LOOP); 1040 BIND(STR2_NEXT); 1041 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1042 br(LE, FIRST_LOOP); 1043 b(NOMATCH); 1044 1045 BIND(STR1_LOOP); 1046 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 1047 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 1048 cmp(ch1, ch2); 1049 br(NE, STR2_NEXT); 1050 b(MATCH); 1051 } 1052 1053 if (icnt1 == -1 || icnt1 == 1) { 1054 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 1055 1056 BIND(DO1); 1057 (this->*str1_load_1chr)(ch1, str1); 1058 cmp(cnt2, (u1)8); 1059 br(LT, DO1_SHORT); 1060 1061 sub(result_tmp, cnt2, 8/str2_chr_size); 1062 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1063 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 1064 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1065 1066 if (str2_isL) { 1067 orr(ch1, ch1, ch1, LSL, 8); 1068 } 1069 orr(ch1, ch1, ch1, LSL, 16); 1070 orr(ch1, ch1, ch1, LSL, 32); 1071 BIND(CH1_LOOP); 1072 ldr(ch2, Address(str2, cnt2_neg)); 1073 eor(ch2, ch1, ch2); 1074 sub(tmp1, ch2, tmp3); 1075 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 1076 bics(tmp1, tmp1, tmp2); 1077 br(NE, HAS_ZERO); 1078 adds(cnt2_neg, cnt2_neg, 8); 1079 br(LT, CH1_LOOP); 1080 1081 cmp(cnt2_neg, (u1)8); 1082 mov(cnt2_neg, 0); 1083 br(LT, CH1_LOOP); 1084 b(NOMATCH); 1085 1086 BIND(HAS_ZERO); 1087 rev(tmp1, tmp1); 1088 clz(tmp1, tmp1); 1089 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 1090 b(MATCH); 1091 1092 BIND(DO1_SHORT); 1093 mov(result_tmp, cnt2); 1094 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 1095 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 1096 BIND(DO1_LOOP); 1097 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 1098 cmpw(ch1, ch2); 1099 br(EQ, MATCH); 1100 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1101 br(LT, DO1_LOOP); 1102 } 1103 } 1104 BIND(NOMATCH); 1105 mov(result, -1); 1106 b(DONE); 1107 BIND(MATCH); 1108 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 1109 BIND(DONE); 1110 } 1111 1112 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 1113 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 1114 1115 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 1116 Register ch, Register result, 1117 Register tmp1, Register tmp2, Register tmp3) 1118 { 1119 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1120 Register cnt1_neg = cnt1; 1121 Register ch1 = rscratch1; 1122 Register result_tmp = rscratch2; 1123 1124 cbz(cnt1, NOMATCH); 1125 1126 cmp(cnt1, (u1)4); 1127 br(LT, DO1_SHORT); 1128 1129 orr(ch, ch, ch, LSL, 16); 1130 orr(ch, ch, ch, LSL, 32); 1131 1132 sub(cnt1, cnt1, 4); 1133 mov(result_tmp, cnt1); 1134 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1135 sub(cnt1_neg, zr, cnt1, LSL, 1); 1136 1137 mov(tmp3, 0x0001000100010001); 1138 1139 BIND(CH1_LOOP); 1140 ldr(ch1, Address(str1, cnt1_neg)); 1141 eor(ch1, ch, ch1); 1142 sub(tmp1, ch1, tmp3); 1143 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 1144 bics(tmp1, tmp1, tmp2); 1145 br(NE, HAS_ZERO); 1146 adds(cnt1_neg, cnt1_neg, 8); 1147 br(LT, CH1_LOOP); 1148 1149 cmp(cnt1_neg, (u1)8); 1150 mov(cnt1_neg, 0); 1151 br(LT, CH1_LOOP); 1152 b(NOMATCH); 1153 1154 BIND(HAS_ZERO); 1155 rev(tmp1, tmp1); 1156 clz(tmp1, tmp1); 1157 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1158 b(MATCH); 1159 1160 BIND(DO1_SHORT); 1161 mov(result_tmp, cnt1); 1162 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1163 sub(cnt1_neg, zr, cnt1, LSL, 1); 1164 BIND(DO1_LOOP); 1165 ldrh(ch1, Address(str1, cnt1_neg)); 1166 cmpw(ch, ch1); 1167 br(EQ, MATCH); 1168 adds(cnt1_neg, cnt1_neg, 2); 1169 br(LT, DO1_LOOP); 1170 BIND(NOMATCH); 1171 mov(result, -1); 1172 b(DONE); 1173 BIND(MATCH); 1174 add(result, result_tmp, cnt1_neg, ASR, 1); 1175 BIND(DONE); 1176 } 1177 1178 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 1179 Register ch, Register result, 1180 FloatRegister ztmp1, 1181 FloatRegister ztmp2, 1182 PRegister tmp_pg, 1183 PRegister tmp_pdn, bool isL) 1184 { 1185 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 
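  // Informal sketch of the SVE loop generated below (not the emitted code):
  //
  //   for (idx = 0; whilelt(pg, idx, cnt1) has active lanes; idx += lanes_per_vector) {
  //     vec = load_masked(str1 + idx, pg);             // inactive lanes are not read
  //     pdn = (vec == broadcast(ch)) under pg;
  //     if (any(pdn)) return idx + first_active(pdn);  // MATCH
  //   }
  //   return -1;                                       // NOMATCH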
1186 assert(tmp_pg->is_governing(), 1187 "this register has to be a governing predicate register"); 1188 1189 Label LOOP, MATCH, DONE, NOMATCH; 1190 Register vec_len = rscratch1; 1191 Register idx = rscratch2; 1192 1193 SIMD_RegVariant T = (isL == true) ? B : H; 1194 1195 cbz(cnt1, NOMATCH); 1196 1197 // Assign the particular char throughout the vector. 1198 sve_dup(ztmp2, T, ch); 1199 if (isL) { 1200 sve_cntb(vec_len); 1201 } else { 1202 sve_cnth(vec_len); 1203 } 1204 mov(idx, 0); 1205 1206 // Generate a predicate to control the reading of input string. 1207 sve_whilelt(tmp_pg, T, idx, cnt1); 1208 1209 BIND(LOOP); 1210 // Read a vector of 8- or 16-bit data depending on the string type. Note 1211 // that inactive elements indicated by the predicate register won't cause 1212 // a data read from memory to the destination vector. 1213 if (isL) { 1214 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1215 } else { 1216 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1217 } 1218 add(idx, idx, vec_len); 1219 1220 // Perform the comparison. An element of the destination predicate is set 1221 // to active if the particular char is matched. 1222 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1223 1224 // Branch if the particular char is found. 1225 br(NE, MATCH); 1226 1227 sve_whilelt(tmp_pg, T, idx, cnt1); 1228 1229 // Loop back if the particular char not found. 1230 br(MI, LOOP); 1231 1232 BIND(NOMATCH); 1233 mov(result, -1); 1234 b(DONE); 1235 1236 BIND(MATCH); 1237 // Undo the index increment. 1238 sub(idx, idx, vec_len); 1239 1240 // Crop the vector to find its location. 1241 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1242 add(result, idx, -1); 1243 sve_incp(result, T, tmp_pdn); 1244 BIND(DONE); 1245 } 1246 1247 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1248 Register ch, Register result, 1249 Register tmp1, Register tmp2, Register tmp3) 1250 { 1251 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1252 Register cnt1_neg = cnt1; 1253 Register ch1 = rscratch1; 1254 Register result_tmp = rscratch2; 1255 1256 cbz(cnt1, NOMATCH); 1257 1258 cmp(cnt1, (u1)8); 1259 br(LT, DO1_SHORT); 1260 1261 orr(ch, ch, ch, LSL, 8); 1262 orr(ch, ch, ch, LSL, 16); 1263 orr(ch, ch, ch, LSL, 32); 1264 1265 sub(cnt1, cnt1, 8); 1266 mov(result_tmp, cnt1); 1267 lea(str1, Address(str1, cnt1)); 1268 sub(cnt1_neg, zr, cnt1); 1269 1270 mov(tmp3, 0x0101010101010101); 1271 1272 BIND(CH1_LOOP); 1273 ldr(ch1, Address(str1, cnt1_neg)); 1274 eor(ch1, ch, ch1); 1275 sub(tmp1, ch1, tmp3); 1276 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1277 bics(tmp1, tmp1, tmp2); 1278 br(NE, HAS_ZERO); 1279 adds(cnt1_neg, cnt1_neg, 8); 1280 br(LT, CH1_LOOP); 1281 1282 cmp(cnt1_neg, (u1)8); 1283 mov(cnt1_neg, 0); 1284 br(LT, CH1_LOOP); 1285 b(NOMATCH); 1286 1287 BIND(HAS_ZERO); 1288 rev(tmp1, tmp1); 1289 clz(tmp1, tmp1); 1290 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1291 b(MATCH); 1292 1293 BIND(DO1_SHORT); 1294 mov(result_tmp, cnt1); 1295 lea(str1, Address(str1, cnt1)); 1296 sub(cnt1_neg, zr, cnt1); 1297 BIND(DO1_LOOP); 1298 ldrb(ch1, Address(str1, cnt1_neg)); 1299 cmp(ch, ch1); 1300 br(EQ, MATCH); 1301 adds(cnt1_neg, cnt1_neg, 1); 1302 br(LT, DO1_LOOP); 1303 BIND(NOMATCH); 1304 mov(result, -1); 1305 b(DONE); 1306 BIND(MATCH); 1307 add(result, result_tmp, cnt1_neg); 1308 BIND(DONE); 1309 } 1310 1311 // Compare strings. 
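// The result follows String.compareTo semantics: the difference of the first pair of
// characters that differ, or, when one string is a prefix of the other, the difference
// of the string lengths in characters.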
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
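  // Roughly: result = cnt1 - cnt2 (flags set), then cnt2 = min(cnt1, cnt2) via the
  // conditional select below. If the shorter string turns out to be a prefix of the
  // longer one, this saved length difference becomes the final result.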
1354 subsw(result, cnt1, cnt2); 1355 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1356 1357 // A very short string 1358 cmpw(cnt2, minCharsInWord); 1359 br(Assembler::LE, SHORT_STRING); 1360 1361 // Compare longwords 1362 // load first parts of strings and finish initialization while loading 1363 { 1364 if (str1_isL == str2_isL) { // LL or UU 1365 ldr(tmp1, Address(str1)); 1366 cmp(str1, str2); 1367 br(Assembler::EQ, DONE); 1368 ldr(tmp2, Address(str2)); 1369 cmp(cnt2, stub_threshold); 1370 br(GE, STUB); 1371 subsw(cnt2, cnt2, minCharsInWord); 1372 br(EQ, TAIL_CHECK); 1373 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1374 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1375 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1376 } else if (isLU) { 1377 ldrs(vtmp, Address(str1)); 1378 ldr(tmp2, Address(str2)); 1379 cmp(cnt2, stub_threshold); 1380 br(GE, STUB); 1381 subw(cnt2, cnt2, 4); 1382 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1383 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1384 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1385 zip1(vtmp, T8B, vtmp, vtmpZ); 1386 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1387 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1388 add(cnt1, cnt1, 4); 1389 fmovd(tmp1, vtmp); 1390 } else { // UL case 1391 ldr(tmp1, Address(str1)); 1392 ldrs(vtmp, Address(str2)); 1393 cmp(cnt2, stub_threshold); 1394 br(GE, STUB); 1395 subw(cnt2, cnt2, 4); 1396 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1397 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1398 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1399 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1400 zip1(vtmp, T8B, vtmp, vtmpZ); 1401 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1402 add(cnt1, cnt1, 8); 1403 fmovd(tmp2, vtmp); 1404 } 1405 adds(cnt2, cnt2, isUL ? 4 : 8); 1406 br(GE, TAIL); 1407 eor(rscratch2, tmp1, tmp2); 1408 cbnz(rscratch2, DIFF); 1409 // main loop 1410 bind(NEXT_WORD); 1411 if (str1_isL == str2_isL) { 1412 ldr(tmp1, Address(str1, cnt2)); 1413 ldr(tmp2, Address(str2, cnt2)); 1414 adds(cnt2, cnt2, 8); 1415 } else if (isLU) { 1416 ldrs(vtmp, Address(str1, cnt1)); 1417 ldr(tmp2, Address(str2, cnt2)); 1418 add(cnt1, cnt1, 4); 1419 zip1(vtmp, T8B, vtmp, vtmpZ); 1420 fmovd(tmp1, vtmp); 1421 adds(cnt2, cnt2, 8); 1422 } else { // UL 1423 ldrs(vtmp, Address(str2, cnt2)); 1424 ldr(tmp1, Address(str1, cnt1)); 1425 zip1(vtmp, T8B, vtmp, vtmpZ); 1426 add(cnt1, cnt1, 8); 1427 fmovd(tmp2, vtmp); 1428 adds(cnt2, cnt2, 4); 1429 } 1430 br(GE, TAIL); 1431 1432 eor(rscratch2, tmp1, tmp2); 1433 cbz(rscratch2, NEXT_WORD); 1434 b(DIFF); 1435 bind(TAIL); 1436 eor(rscratch2, tmp1, tmp2); 1437 cbnz(rscratch2, DIFF); 1438 // Last longword. In the case where length == 4 we compare the 1439 // same longword twice, but that's still faster than another 1440 // conditional branch. 1441 if (str1_isL == str2_isL) { 1442 ldr(tmp1, Address(str1)); 1443 ldr(tmp2, Address(str2)); 1444 } else if (isLU) { 1445 ldrs(vtmp, Address(str1)); 1446 ldr(tmp2, Address(str2)); 1447 zip1(vtmp, T8B, vtmp, vtmpZ); 1448 fmovd(tmp1, vtmp); 1449 } else { // UL 1450 ldrs(vtmp, Address(str2)); 1451 ldr(tmp1, Address(str1)); 1452 zip1(vtmp, T8B, vtmp, vtmpZ); 1453 fmovd(tmp2, vtmp); 1454 } 1455 bind(TAIL_CHECK); 1456 eor(rscratch2, tmp1, tmp2); 1457 cbz(rscratch2, DONE); 1458 1459 // Find the first different characters in the longwords and 1460 // compute their difference. 1461 bind(DIFF); 1462 rev(rscratch2, rscratch2); 1463 clz(rscratch2, rscratch2); 1464 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1465 lsrv(tmp1, tmp1, rscratch2); 1466 (this->*ext_chr)(tmp1, tmp1); 1467 lsrv(tmp2, tmp2, rscratch2); 1468 (this->*ext_chr)(tmp2, tmp2); 1469 subw(result, tmp1, tmp2); 1470 b(DONE); 1471 } 1472 1473 bind(STUB); 1474 RuntimeAddress stub = nullptr; 1475 switch(ae) { 1476 case StrIntrinsicNode::LL: 1477 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1478 break; 1479 case StrIntrinsicNode::UU: 1480 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1481 break; 1482 case StrIntrinsicNode::LU: 1483 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1484 break; 1485 case StrIntrinsicNode::UL: 1486 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1487 break; 1488 default: 1489 ShouldNotReachHere(); 1490 } 1491 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1492 address call = trampoline_call(stub); 1493 if (call == nullptr) { 1494 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1495 ciEnv::current()->record_failure("CodeCache is full"); 1496 return; 1497 } 1498 b(DONE); 1499 1500 bind(SHORT_STRING); 1501 // Is the minimum length zero? 1502 cbz(cnt2, DONE); 1503 // arrange code to do most branches while loading and loading next characters 1504 // while comparing previous 1505 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1506 subs(cnt2, cnt2, 1); 1507 br(EQ, SHORT_LAST_INIT); 1508 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1509 b(SHORT_LOOP_START); 1510 bind(SHORT_LOOP); 1511 subs(cnt2, cnt2, 1); 1512 br(EQ, SHORT_LAST); 1513 bind(SHORT_LOOP_START); 1514 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1515 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1516 cmp(tmp1, cnt1); 1517 br(NE, SHORT_LOOP_TAIL); 1518 subs(cnt2, cnt2, 1); 1519 br(EQ, SHORT_LAST2); 1520 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1521 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1522 cmp(tmp2, rscratch1); 1523 br(EQ, SHORT_LOOP); 1524 sub(result, tmp2, rscratch1); 1525 b(DONE); 1526 bind(SHORT_LOOP_TAIL); 1527 sub(result, tmp1, cnt1); 1528 b(DONE); 1529 bind(SHORT_LAST2); 1530 cmp(tmp2, rscratch1); 1531 br(EQ, DONE); 1532 sub(result, tmp2, rscratch1); 1533 1534 b(DONE); 1535 bind(SHORT_LAST_INIT); 1536 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1537 bind(SHORT_LAST); 1538 cmp(tmp1, cnt1); 1539 br(EQ, DONE); 1540 sub(result, tmp1, cnt1); 1541 1542 bind(DONE); 1543 1544 BLOCK_COMMENT("} string_compare"); 1545 } 1546 1547 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1548 FloatRegister src2, Condition cond, bool isQ) { 1549 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1550 FloatRegister zn = src1, zm = src2; 1551 bool needs_negation = false; 1552 switch (cond) { 1553 case LT: cond = GT; zn = src2; zm = src1; break; 1554 case LE: cond = GE; zn = src2; zm = src1; break; 1555 case LO: cond = HI; zn = src2; zm = src1; break; 1556 case LS: cond = HS; zn = src2; zm = src1; break; 1557 case NE: cond = EQ; needs_negation = true; break; 1558 default: 1559 break; 1560 } 1561 1562 if (is_floating_point_type(bt)) { 1563 fcm(cond, dst, size, zn, zm); 1564 } else { 1565 cm(cond, dst, size, zn, zm); 1566 } 1567 1568 if (needs_negation) { 1569 notr(dst, isQ ? 
T16B : T8B, dst); 1570 } 1571 } 1572 1573 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1574 Condition cond, bool isQ) { 1575 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1576 if (bt == T_FLOAT || bt == T_DOUBLE) { 1577 if (cond == Assembler::NE) { 1578 fcm(Assembler::EQ, dst, size, src); 1579 notr(dst, isQ ? T16B : T8B, dst); 1580 } else { 1581 fcm(cond, dst, size, src); 1582 } 1583 } else { 1584 if (cond == Assembler::NE) { 1585 cm(Assembler::EQ, dst, size, src); 1586 notr(dst, isQ ? T16B : T8B, dst); 1587 } else { 1588 cm(cond, dst, size, src); 1589 } 1590 } 1591 } 1592 1593 // Compress the least significant bit of each byte to the rightmost and clear 1594 // the higher garbage bits. 1595 void C2_MacroAssembler::bytemask_compress(Register dst) { 1596 // Example input, dst = 0x01 00 00 00 01 01 00 01 1597 // The "??" bytes are garbage. 1598 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1599 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1600 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1601 andr(dst, dst, 0xff); // dst = 0x8D 1602 } 1603 1604 // Pack the lowest-numbered bit of each mask element in src into a long value 1605 // in dst, at most the first 64 lane elements. 1606 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1607 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1608 FloatRegister vtmp1, FloatRegister vtmp2) { 1609 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1610 assert_different_registers(dst, rscratch1); 1611 assert_different_registers(vtmp1, vtmp2); 1612 1613 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1614 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1615 // Expected: dst = 0x658D 1616 1617 // Convert the mask into vector with sequential bytes. 1618 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1619 sve_cpy(vtmp1, size, src, 1, false); 1620 if (bt != T_BYTE) { 1621 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1622 } 1623 1624 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1625 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1626 // is to compress each significant bit of the byte in a cross-lane way. Due 1627 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1628 // (bit-compress in each lane) with the biggest lane size (T = D) then 1629 // concatenate the results. 1630 1631 // The second source input of BEXT, initialized with 0x01 in each byte. 1632 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1633 sve_dup(vtmp2, B, 1); 1634 1635 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1636 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1637 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1638 // --------------------------------------- 1639 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1640 sve_bext(vtmp1, D, vtmp1, vtmp2); 1641 1642 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1643 // result to dst. 1644 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1645 // dst = 0x658D 1646 if (lane_cnt <= 8) { 1647 // No need to concatenate. 
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different types, with
// a valid src (0x658D) on a machine with a 1024-bit vector size.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG-type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put the long value from the general purpose register into the first lane of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // sve_cmp generates the mask with a minimum granularity of one byte, so we need to
  // transform the bit mask in the first lane into a byte mask. This can be done with
  // SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte into its own
  // 8-byte lane.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
1718 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1719 sve_dup(vtmp2, B, 1); 1720 1721 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1722 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1723 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1724 // --------------------------------------- 1725 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1726 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1727 1728 if (bt != T_BYTE) { 1729 sve_vector_extend(vtmp1, size, vtmp1, B); 1730 } 1731 // Generate mask according to the given vector, in which the elements have been 1732 // extended to expected type. 1733 // dst = 0b01100101 10001101 1734 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1735 } 1736 1737 // Clobbers: rflags 1738 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1739 FloatRegister zn, FloatRegister zm, Condition cond) { 1740 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1741 FloatRegister z1 = zn, z2 = zm; 1742 switch (cond) { 1743 case LE: z1 = zm; z2 = zn; cond = GE; break; 1744 case LT: z1 = zm; z2 = zn; cond = GT; break; 1745 case LO: z1 = zm; z2 = zn; cond = HI; break; 1746 case LS: z1 = zm; z2 = zn; cond = HS; break; 1747 default: 1748 break; 1749 } 1750 1751 SIMD_RegVariant size = elemType_to_regVariant(bt); 1752 if (is_floating_point_type(bt)) { 1753 sve_fcm(cond, pd, size, pg, z1, z2); 1754 } else { 1755 assert(is_integral_type(bt), "unsupported element type"); 1756 sve_cmp(cond, pd, size, pg, z1, z2); 1757 } 1758 } 1759 1760 // Get index of the last mask lane that is set 1761 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1762 SIMD_RegVariant size = elemType_to_regVariant(bt); 1763 sve_rev(ptmp, size, src); 1764 sve_brkb(ptmp, ptrue, ptmp, false); 1765 sve_cntp(dst, size, ptrue, ptmp); 1766 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1767 subw(dst, rscratch1, dst); 1768 } 1769 1770 // Extend integer vector src to dst with the same lane count 1771 // but larger element size, e.g. 4B -> 4I 1772 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1773 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1774 if (src_bt == T_BYTE) { 1775 if (dst_bt == T_SHORT) { 1776 // 4B/8B to 4S/8S 1777 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1778 } else { 1779 // 4B to 4I 1780 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1781 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1782 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1783 } 1784 } else if (src_bt == T_SHORT) { 1785 // 4S to 4I 1786 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1787 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1788 } else if (src_bt == T_INT) { 1789 // 2I to 2L 1790 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1791 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1792 } else { 1793 ShouldNotReachHere(); 1794 } 1795 } 1796 1797 // Narrow integer vector src down to dst with the same lane count 1798 // but smaller element size, e.g.
4I -> 4B 1799 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1800 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1801 if (src_bt == T_SHORT) { 1802 // 4S/8S to 4B/8B 1803 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1804 assert(dst_bt == T_BYTE, "unsupported"); 1805 xtn(dst, T8B, src, T8H); 1806 } else if (src_bt == T_INT) { 1807 // 4I to 4B/4S 1808 assert(src_vlen_in_bytes == 16, "unsupported"); 1809 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1810 xtn(dst, T4H, src, T4S); 1811 if (dst_bt == T_BYTE) { 1812 xtn(dst, T8B, dst, T8H); 1813 } 1814 } else if (src_bt == T_LONG) { 1815 // 2L to 2I 1816 assert(src_vlen_in_bytes == 16, "unsupported"); 1817 assert(dst_bt == T_INT, "unsupported"); 1818 xtn(dst, T2S, src, T2D); 1819 } else { 1820 ShouldNotReachHere(); 1821 } 1822 } 1823 1824 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1825 FloatRegister src, SIMD_RegVariant src_size, 1826 bool is_unsigned) { 1827 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1828 1829 if (src_size == B) { 1830 switch (dst_size) { 1831 case H: 1832 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1833 break; 1834 case S: 1835 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1836 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1837 break; 1838 case D: 1839 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1840 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1841 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1842 break; 1843 default: 1844 ShouldNotReachHere(); 1845 } 1846 } else if (src_size == H) { 1847 if (dst_size == S) { 1848 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1849 } else { // D 1850 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1851 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1852 } 1853 } else if (src_size == S) { 1854 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1855 } 1856 } 1857 1858 // Vector narrow from src to dst with specified element sizes. 1859 // High part of dst vector will be filled with zero. 1860 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1861 FloatRegister src, SIMD_RegVariant src_size, 1862 FloatRegister tmp) { 1863 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1864 assert_different_registers(src, tmp); 1865 sve_dup(tmp, src_size, 0); 1866 if (src_size == D) { 1867 switch (dst_size) { 1868 case S: 1869 sve_uzp1(dst, S, src, tmp); 1870 break; 1871 case H: 1872 assert_different_registers(dst, tmp); 1873 sve_uzp1(dst, S, src, tmp); 1874 sve_uzp1(dst, H, dst, tmp); 1875 break; 1876 case B: 1877 assert_different_registers(dst, tmp); 1878 sve_uzp1(dst, S, src, tmp); 1879 sve_uzp1(dst, H, dst, tmp); 1880 sve_uzp1(dst, B, dst, tmp); 1881 break; 1882 default: 1883 ShouldNotReachHere(); 1884 } 1885 } else if (src_size == S) { 1886 if (dst_size == H) { 1887 sve_uzp1(dst, H, src, tmp); 1888 } else { // B 1889 assert_different_registers(dst, tmp); 1890 sve_uzp1(dst, H, src, tmp); 1891 sve_uzp1(dst, B, dst, tmp); 1892 } 1893 } else if (src_size == H) { 1894 sve_uzp1(dst, B, src, tmp); 1895 } 1896 } 1897 1898 // Extend src predicate to dst predicate with the same lane count but larger 1899 // element size, e.g. 
64Byte -> 512Long 1900 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1901 uint dst_element_length_in_bytes, 1902 uint src_element_length_in_bytes) { 1903 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1904 sve_punpklo(dst, src); 1905 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1906 sve_punpklo(dst, src); 1907 sve_punpklo(dst, dst); 1908 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1909 sve_punpklo(dst, src); 1910 sve_punpklo(dst, dst); 1911 sve_punpklo(dst, dst); 1912 } else { 1913 assert(false, "unsupported"); 1914 ShouldNotReachHere(); 1915 } 1916 } 1917 1918 // Narrow src predicate to dst predicate with the same lane count but 1919 // smaller element size, e.g. 512Long -> 64Byte 1920 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1921 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1922 // The insignificant bits in src predicate are expected to be zero. 1923 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1924 // passed as the second argument. An example narrowing operation with a given mask would be - 1925 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1926 // Mask (for 2 Longs) : TF 1927 // Predicate register for the above mask (16 bits) : 00000001 00000000 1928 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1929 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1930 assert_different_registers(src, ptmp); 1931 assert_different_registers(dst, ptmp); 1932 sve_pfalse(ptmp); 1933 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1934 sve_uzp1(dst, B, src, ptmp); 1935 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1936 sve_uzp1(dst, H, src, ptmp); 1937 sve_uzp1(dst, B, dst, ptmp); 1938 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1939 sve_uzp1(dst, S, src, ptmp); 1940 sve_uzp1(dst, H, dst, ptmp); 1941 sve_uzp1(dst, B, dst, ptmp); 1942 } else { 1943 assert(false, "unsupported"); 1944 ShouldNotReachHere(); 1945 } 1946 } 1947 1948 // Vector reduction add for integral type with ASIMD instructions. 1949 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1950 Register isrc, FloatRegister vsrc, 1951 unsigned vector_length_in_bytes, 1952 FloatRegister vtmp) { 1953 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1954 assert_different_registers(dst, isrc); 1955 bool isQ = vector_length_in_bytes == 16; 1956 1957 BLOCK_COMMENT("neon_reduce_add_integral {"); 1958 switch(bt) { 1959 case T_BYTE: 1960 addv(vtmp, isQ ? T16B : T8B, vsrc); 1961 smov(dst, vtmp, B, 0); 1962 addw(dst, dst, isrc, ext::sxtb); 1963 break; 1964 case T_SHORT: 1965 addv(vtmp, isQ ? T8H : T4H, vsrc); 1966 smov(dst, vtmp, H, 0); 1967 addw(dst, dst, isrc, ext::sxth); 1968 break; 1969 case T_INT: 1970 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1971 umov(dst, vtmp, S, 0); 1972 addw(dst, dst, isrc); 1973 break; 1974 case T_LONG: 1975 assert(isQ, "unsupported"); 1976 addpd(vtmp, vsrc); 1977 umov(dst, vtmp, D, 0); 1978 add(dst, dst, isrc); 1979 break; 1980 default: 1981 assert(false, "unsupported"); 1982 ShouldNotReachHere(); 1983 } 1984 BLOCK_COMMENT("} neon_reduce_add_integral"); 1985 } 1986 1987 // Vector reduction multiply for integral type with ASIMD instructions. 
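// The vector is folded in half with lane-wise multiplies until two lanes remain; those two lanes and the scalar input isrc are then combined with scalar multiplies, sign-extending sub-word results.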
1988 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1989 // Clobbers: rscratch1 1990 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1991 Register isrc, FloatRegister vsrc, 1992 unsigned vector_length_in_bytes, 1993 FloatRegister vtmp1, FloatRegister vtmp2) { 1994 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1995 bool isQ = vector_length_in_bytes == 16; 1996 1997 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1998 switch(bt) { 1999 case T_BYTE: 2000 if (isQ) { 2001 // Multiply the lower half and higher half of vector iteratively. 2002 // vtmp1 = vsrc[8:15] 2003 ins(vtmp1, D, vsrc, 0, 1); 2004 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 2005 mulv(vtmp1, T8B, vtmp1, vsrc); 2006 // vtmp2 = vtmp1[4:7] 2007 ins(vtmp2, S, vtmp1, 0, 1); 2008 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 2009 mulv(vtmp1, T8B, vtmp2, vtmp1); 2010 } else { 2011 ins(vtmp1, S, vsrc, 0, 1); 2012 mulv(vtmp1, T8B, vtmp1, vsrc); 2013 } 2014 // vtmp2 = vtmp1[2:3] 2015 ins(vtmp2, H, vtmp1, 0, 1); 2016 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 2017 mulv(vtmp2, T8B, vtmp2, vtmp1); 2018 // dst = vtmp2[0] * isrc * vtmp2[1] 2019 umov(rscratch1, vtmp2, B, 0); 2020 mulw(dst, rscratch1, isrc); 2021 sxtb(dst, dst); 2022 umov(rscratch1, vtmp2, B, 1); 2023 mulw(dst, rscratch1, dst); 2024 sxtb(dst, dst); 2025 break; 2026 case T_SHORT: 2027 if (isQ) { 2028 ins(vtmp2, D, vsrc, 0, 1); 2029 mulv(vtmp2, T4H, vtmp2, vsrc); 2030 ins(vtmp1, S, vtmp2, 0, 1); 2031 mulv(vtmp1, T4H, vtmp1, vtmp2); 2032 } else { 2033 ins(vtmp1, S, vsrc, 0, 1); 2034 mulv(vtmp1, T4H, vtmp1, vsrc); 2035 } 2036 umov(rscratch1, vtmp1, H, 0); 2037 mulw(dst, rscratch1, isrc); 2038 sxth(dst, dst); 2039 umov(rscratch1, vtmp1, H, 1); 2040 mulw(dst, rscratch1, dst); 2041 sxth(dst, dst); 2042 break; 2043 case T_INT: 2044 if (isQ) { 2045 ins(vtmp1, D, vsrc, 0, 1); 2046 mulv(vtmp1, T2S, vtmp1, vsrc); 2047 } else { 2048 vtmp1 = vsrc; 2049 } 2050 umov(rscratch1, vtmp1, S, 0); 2051 mul(dst, rscratch1, isrc); 2052 umov(rscratch1, vtmp1, S, 1); 2053 mul(dst, rscratch1, dst); 2054 break; 2055 case T_LONG: 2056 umov(rscratch1, vsrc, D, 0); 2057 mul(dst, isrc, rscratch1); 2058 umov(rscratch1, vsrc, D, 1); 2059 mul(dst, dst, rscratch1); 2060 break; 2061 default: 2062 assert(false, "unsupported"); 2063 ShouldNotReachHere(); 2064 } 2065 BLOCK_COMMENT("} neon_reduce_mul_integral"); 2066 } 2067 2068 // Vector reduction multiply for floating-point type with ASIMD instructions. 
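// The scalar input fsrc is multiplied by the vector lanes one at a time in ascending lane order (fsrc * v[0] * v[1] * ...), since floating-point multiplication is not associative.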
2069 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 2070 FloatRegister fsrc, FloatRegister vsrc, 2071 unsigned vector_length_in_bytes, 2072 FloatRegister vtmp) { 2073 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2074 bool isQ = vector_length_in_bytes == 16; 2075 2076 BLOCK_COMMENT("neon_reduce_mul_fp {"); 2077 switch(bt) { 2078 case T_FLOAT: 2079 fmuls(dst, fsrc, vsrc); 2080 ins(vtmp, S, vsrc, 0, 1); 2081 fmuls(dst, dst, vtmp); 2082 if (isQ) { 2083 ins(vtmp, S, vsrc, 0, 2); 2084 fmuls(dst, dst, vtmp); 2085 ins(vtmp, S, vsrc, 0, 3); 2086 fmuls(dst, dst, vtmp); 2087 } 2088 break; 2089 case T_DOUBLE: 2090 assert(isQ, "unsupported"); 2091 fmuld(dst, fsrc, vsrc); 2092 ins(vtmp, D, vsrc, 0, 1); 2093 fmuld(dst, dst, vtmp); 2094 break; 2095 default: 2096 assert(false, "unsupported"); 2097 ShouldNotReachHere(); 2098 } 2099 BLOCK_COMMENT("} neon_reduce_mul_fp"); 2100 } 2101 2102 // Helper to select logical instruction 2103 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 2104 Register Rn, Register Rm, 2105 enum shift_kind kind, unsigned shift) { 2106 switch(opc) { 2107 case Op_AndReductionV: 2108 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 2109 break; 2110 case Op_OrReductionV: 2111 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 2112 break; 2113 case Op_XorReductionV: 2114 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 2115 break; 2116 default: 2117 assert(false, "unsupported"); 2118 ShouldNotReachHere(); 2119 } 2120 } 2121 2122 // Vector reduction logical operations And, Or, Xor 2123 // Clobbers: rscratch1 2124 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2125 Register isrc, FloatRegister vsrc, 2126 unsigned vector_length_in_bytes) { 2127 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2128 "unsupported"); 2129 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2130 assert_different_registers(dst, isrc); 2131 bool isQ = vector_length_in_bytes == 16; 2132 2133 BLOCK_COMMENT("neon_reduce_logical {"); 2134 umov(rscratch1, vsrc, isQ ? D : S, 0); 2135 umov(dst, vsrc, isQ ? 
D : S, 1); 2136 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2137 switch(bt) { 2138 case T_BYTE: 2139 if (isQ) { 2140 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2141 } 2142 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2143 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2144 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2145 sxtb(dst, dst); 2146 break; 2147 case T_SHORT: 2148 if (isQ) { 2149 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2150 } 2151 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2152 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2153 sxth(dst, dst); 2154 break; 2155 case T_INT: 2156 if (isQ) { 2157 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2158 } 2159 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2160 break; 2161 case T_LONG: 2162 assert(isQ, "unsupported"); 2163 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2164 break; 2165 default: 2166 assert(false, "unsupported"); 2167 ShouldNotReachHere(); 2168 } 2169 BLOCK_COMMENT("} neon_reduce_logical"); 2170 } 2171 2172 // Vector reduction min/max for integral type with ASIMD instructions. 2173 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2174 // Clobbers: rscratch1, rflags 2175 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2176 Register isrc, FloatRegister vsrc, 2177 unsigned vector_length_in_bytes, 2178 FloatRegister vtmp) { 2179 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2180 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2181 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2182 assert_different_registers(dst, isrc); 2183 bool isQ = vector_length_in_bytes == 16; 2184 bool is_min = opc == Op_MinReductionV; 2185 2186 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2187 if (bt == T_LONG) { 2188 assert(vtmp == fnoreg, "should be"); 2189 assert(isQ, "should be"); 2190 umov(rscratch1, vsrc, D, 0); 2191 cmp(isrc, rscratch1); 2192 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2193 umov(rscratch1, vsrc, D, 1); 2194 cmp(dst, rscratch1); 2195 csel(dst, dst, rscratch1, is_min ? LT : GT); 2196 } else { 2197 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2198 if (size == T2S) { 2199 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2200 } else { 2201 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2202 } 2203 if (bt == T_INT) { 2204 umov(dst, vtmp, S, 0); 2205 } else { 2206 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2207 } 2208 cmpw(dst, isrc); 2209 cselw(dst, dst, isrc, is_min ? LT : GT); 2210 } 2211 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2212 } 2213 2214 // Vector reduction for integral type with SVE instruction. 2215 // Supported operations are Add, And, Or, Xor, Max, Min. 2216 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
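// For T_BYTE/T_SHORT, the And/Or/Xor results are sign-extended back to int at the end (see below); the Max/Min paths rely on smov, which sign-extends the extracted lane.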
2217 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2218 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2219 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2220 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2221 assert_different_registers(src1, dst); 2222 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2223 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2224 switch (opc) { 2225 case Op_AddReductionVI: { 2226 sve_uaddv(tmp, size, pg, src2); 2227 if (bt == T_BYTE) { 2228 smov(dst, tmp, size, 0); 2229 addw(dst, src1, dst, ext::sxtb); 2230 } else if (bt == T_SHORT) { 2231 smov(dst, tmp, size, 0); 2232 addw(dst, src1, dst, ext::sxth); 2233 } else { 2234 umov(dst, tmp, size, 0); 2235 addw(dst, dst, src1); 2236 } 2237 break; 2238 } 2239 case Op_AddReductionVL: { 2240 sve_uaddv(tmp, size, pg, src2); 2241 umov(dst, tmp, size, 0); 2242 add(dst, dst, src1); 2243 break; 2244 } 2245 case Op_AndReductionV: { 2246 sve_andv(tmp, size, pg, src2); 2247 if (bt == T_INT || bt == T_LONG) { 2248 umov(dst, tmp, size, 0); 2249 } else { 2250 smov(dst, tmp, size, 0); 2251 } 2252 if (bt == T_LONG) { 2253 andr(dst, dst, src1); 2254 } else { 2255 andw(dst, dst, src1); 2256 } 2257 break; 2258 } 2259 case Op_OrReductionV: { 2260 sve_orv(tmp, size, pg, src2); 2261 if (bt == T_INT || bt == T_LONG) { 2262 umov(dst, tmp, size, 0); 2263 } else { 2264 smov(dst, tmp, size, 0); 2265 } 2266 if (bt == T_LONG) { 2267 orr(dst, dst, src1); 2268 } else { 2269 orrw(dst, dst, src1); 2270 } 2271 break; 2272 } 2273 case Op_XorReductionV: { 2274 sve_eorv(tmp, size, pg, src2); 2275 if (bt == T_INT || bt == T_LONG) { 2276 umov(dst, tmp, size, 0); 2277 } else { 2278 smov(dst, tmp, size, 0); 2279 } 2280 if (bt == T_LONG) { 2281 eor(dst, dst, src1); 2282 } else { 2283 eorw(dst, dst, src1); 2284 } 2285 break; 2286 } 2287 case Op_MaxReductionV: { 2288 sve_smaxv(tmp, size, pg, src2); 2289 if (bt == T_INT || bt == T_LONG) { 2290 umov(dst, tmp, size, 0); 2291 } else { 2292 smov(dst, tmp, size, 0); 2293 } 2294 if (bt == T_LONG) { 2295 cmp(dst, src1); 2296 csel(dst, dst, src1, Assembler::GT); 2297 } else { 2298 cmpw(dst, src1); 2299 cselw(dst, dst, src1, Assembler::GT); 2300 } 2301 break; 2302 } 2303 case Op_MinReductionV: { 2304 sve_sminv(tmp, size, pg, src2); 2305 if (bt == T_INT || bt == T_LONG) { 2306 umov(dst, tmp, size, 0); 2307 } else { 2308 smov(dst, tmp, size, 0); 2309 } 2310 if (bt == T_LONG) { 2311 cmp(dst, src1); 2312 csel(dst, dst, src1, Assembler::LT); 2313 } else { 2314 cmpw(dst, src1); 2315 cselw(dst, dst, src1, Assembler::LT); 2316 } 2317 break; 2318 } 2319 default: 2320 assert(false, "unsupported"); 2321 ShouldNotReachHere(); 2322 } 2323 2324 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2325 if (bt == T_BYTE) { 2326 sxtb(dst, dst); 2327 } else if (bt == T_SHORT) { 2328 sxth(dst, dst); 2329 } 2330 } 2331 } 2332 2333 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2334 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2335 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
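// For example, with bt == T_BYTE on a 256-bit vector (max 32 lanes): lane_cnt == 5 uses the fixed VL5 pattern, lane_cnt == 30 matches the MUL3 pattern, and a count such as 29 falls back to the whileltw encoding.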
2336 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2337 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2338 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2339 2340 // Set all elements to false if the input "lane_cnt" is zero. 2341 if (lane_cnt == 0) { 2342 sve_pfalse(dst); 2343 return; 2344 } 2345 2346 SIMD_RegVariant size = elemType_to_regVariant(bt); 2347 assert(size != Q, "invalid size"); 2348 2349 // Set all true if "lane_cnt" equals to the max lane count. 2350 if (lane_cnt == max_vector_length) { 2351 sve_ptrue(dst, size, /* ALL */ 0b11111); 2352 return; 2353 } 2354 2355 // Fixed numbers for "ptrue". 2356 switch(lane_cnt) { 2357 case 1: /* VL1 */ 2358 case 2: /* VL2 */ 2359 case 3: /* VL3 */ 2360 case 4: /* VL4 */ 2361 case 5: /* VL5 */ 2362 case 6: /* VL6 */ 2363 case 7: /* VL7 */ 2364 case 8: /* VL8 */ 2365 sve_ptrue(dst, size, lane_cnt); 2366 return; 2367 case 16: 2368 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2369 return; 2370 case 32: 2371 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2372 return; 2373 case 64: 2374 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2375 return; 2376 case 128: 2377 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2378 return; 2379 case 256: 2380 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2381 return; 2382 default: 2383 break; 2384 } 2385 2386 // Special patterns for "ptrue". 2387 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2388 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2389 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2390 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2391 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2392 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2393 } else { 2394 // Encode to "whileltw" for the remaining cases. 2395 mov(rscratch1, lane_cnt); 2396 sve_whileltw(dst, size, zr, rscratch1); 2397 } 2398 } 2399 2400 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2401 // Any remaining elements of dst will be filled with zero. 2402 // Clobbers: rscratch1 2403 // Preserves: src, mask 2404 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2405 FloatRegister vtmp1, FloatRegister vtmp2, 2406 PRegister pgtmp) { 2407 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2408 assert_different_registers(dst, src, vtmp1, vtmp2); 2409 assert_different_registers(mask, pgtmp); 2410 2411 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 2412 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 2413 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 2414 sve_dup(vtmp2, H, 0); 2415 2416 // Extend lowest half to type INT. 2417 // dst = 00004444 00003333 00002222 00001111 2418 sve_uunpklo(dst, S, src); 2419 // pgtmp = 00000001 00000000 00000001 00000001 2420 sve_punpklo(pgtmp, mask); 2421 // Pack the active elements in size of type INT to the right, 2422 // and fill the remainings with zero. 2423 // dst = 00000000 00004444 00002222 00001111 2424 sve_compact(dst, S, dst, pgtmp); 2425 // Narrow the result back to type SHORT. 2426 // dst = 0000 0000 0000 0000 0000 4444 2222 1111 2427 sve_uzp1(dst, H, dst, vtmp2); 2428 // Count the active elements of lowest half. 2429 // rscratch1 = 3 2430 sve_cntp(rscratch1, S, ptrue, pgtmp); 2431 2432 // Repeat to the highest half. 
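// (the same unpack/compact/narrow sequence as above, this time on the upper halves of src and mask)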
2433 // pgtmp = 00000001 00000000 00000000 00000001 2434 sve_punpkhi(pgtmp, mask); 2435 // vtmp1 = 00008888 00007777 00006666 00005555 2436 sve_uunpkhi(vtmp1, S, src); 2437 // vtmp1 = 00000000 00000000 00008888 00005555 2438 sve_compact(vtmp1, S, vtmp1, pgtmp); 2439 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2440 sve_uzp1(vtmp1, H, vtmp1, vtmp2); 2441 2442 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111 2443 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2444 // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where 2445 // TRUE_CNT is the number of active elements in the compressed low. 2446 neg(rscratch1, rscratch1); 2447 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2448 sve_index(vtmp2, H, rscratch1, 1); 2449 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000 2450 sve_tbl(vtmp1, H, vtmp1, vtmp2); 2451 2452 // Combine the compressed high (after shifting) with the compressed low. 2453 // dst = 0000 0000 0000 8888 5555 4444 2222 1111 2454 sve_orr(dst, dst, vtmp1); 2455 } 2456 2457 // Clobbers: rscratch1, rscratch2 2458 // Preserves: src, mask 2459 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, 2460 FloatRegister vtmp1, FloatRegister vtmp2, 2461 FloatRegister vtmp3, FloatRegister vtmp4, 2462 PRegister ptmp, PRegister pgtmp) { 2463 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2464 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4); 2465 assert_different_registers(mask, ptmp, pgtmp); 2466 // Example input: src = 88 77 66 55 44 33 22 11 2467 // mask = 01 00 00 01 01 00 01 01 2468 // Expected result: dst = 00 00 00 88 55 44 22 11 2469 2470 sve_dup(vtmp4, B, 0); 2471 // Extend lowest half to type SHORT. 2472 // vtmp1 = 0044 0033 0022 0011 2473 sve_uunpklo(vtmp1, H, src); 2474 // ptmp = 0001 0000 0001 0001 2475 sve_punpklo(ptmp, mask); 2476 // Count the active elements of lowest half. 2477 // rscratch2 = 3 2478 sve_cntp(rscratch2, H, ptrue, ptmp); 2479 // Pack the active elements in size of type SHORT to the right, 2480 // and fill the remaining lanes with zero. 2481 // dst = 0000 0044 0022 0011 2482 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp); 2483 // Narrow the result back to type BYTE. 2484 // dst = 00 00 00 00 00 44 22 11 2485 sve_uzp1(dst, B, dst, vtmp4); 2486 2487 // Repeat to the highest half. 2488 // ptmp = 0001 0000 0000 0001 2489 sve_punpkhi(ptmp, mask); 2490 // vtmp2 = 0088 0077 0066 0055 2491 sve_uunpkhi(vtmp2, H, src); 2492 // vtmp1 = 0000 0000 0088 0055 2493 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp); 2494 2495 sve_dup(vtmp4, B, 0); 2496 // vtmp1 = 00 00 00 00 00 00 88 55 2497 sve_uzp1(vtmp1, B, vtmp1, vtmp4); 2498 2499 // Compressed low: dst = 00 00 00 00 00 44 22 11 2500 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55 2501 // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where 2502 // TRUE_CNT is the number of active elements in the compressed low. 2503 neg(rscratch2, rscratch2); 2504 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2505 sve_index(vtmp2, B, rscratch2, 1); 2506 // vtmp1 = 00 00 00 88 55 00 00 00 2507 sve_tbl(vtmp1, B, vtmp1, vtmp2); 2508 // Combine the compressed high (after shifting) with the compressed low.
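// The shifted high part is non-zero only in lanes at and above TRUE_CNT, while the compressed low part is non-zero only below TRUE_CNT, so a plain OR merges them without overlap.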
2509 // dst = 00 00 00 88 55 44 22 11 2510 sve_orr(dst, dst, vtmp1); 2511 } 2512 2513 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2514 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2515 SIMD_Arrangement size = isQ ? T16B : T8B; 2516 if (bt == T_BYTE) { 2517 rbit(dst, size, src); 2518 } else { 2519 neon_reverse_bytes(dst, src, bt, isQ); 2520 rbit(dst, size, dst); 2521 } 2522 } 2523 2524 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2525 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2526 SIMD_Arrangement size = isQ ? T16B : T8B; 2527 switch (bt) { 2528 case T_BYTE: 2529 if (dst != src) { 2530 orr(dst, size, src, src); 2531 } 2532 break; 2533 case T_SHORT: 2534 rev16(dst, size, src); 2535 break; 2536 case T_INT: 2537 rev32(dst, size, src); 2538 break; 2539 case T_LONG: 2540 rev64(dst, size, src); 2541 break; 2542 default: 2543 assert(false, "unsupported"); 2544 ShouldNotReachHere(); 2545 } 2546 } 2547 2548 // VectorRearrange implementation for short/int/float/long/double types with NEON 2549 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction. 2550 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group. 2551 // For VectorRearrange long/double, we compare the shuffle input with iota indices, 2552 // and use bsl to implement the operation. 2553 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src, 2554 FloatRegister shuffle, FloatRegister tmp, 2555 BasicType bt, bool isQ) { 2556 assert_different_registers(dst, src, shuffle, tmp); 2557 SIMD_Arrangement size1 = isQ ? T16B : T8B; 2558 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); 2559 2560 // Here is an example that rearranges a NEON vector with 4 ints: 2561 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] 2562 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1]. 2563 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector 2564 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get 2565 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. 2566 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100], 2567 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] 2568 // 4. Use Vm as index register, and use V1 as table register. 2569 // Then get V2 as the result by tbl NEON instructions. 2570 switch (bt) { 2571 case T_SHORT: 2572 mov(tmp, size1, 0x02); 2573 mulv(dst, size2, shuffle, tmp); 2574 mov(tmp, size2, 0x0100); 2575 addv(dst, size1, dst, tmp); 2576 tbl(dst, size1, src, 1, dst); 2577 break; 2578 case T_INT: 2579 case T_FLOAT: 2580 mov(tmp, size1, 0x04); 2581 mulv(dst, size2, shuffle, tmp); 2582 mov(tmp, size2, 0x03020100); 2583 addv(dst, size1, dst, tmp); 2584 tbl(dst, size1, src, 1, dst); 2585 break; 2586 case T_LONG: 2587 case T_DOUBLE: 2588 // Load the iota indices for Long type. The indices are ordered by 2589 // type B/S/I/L/F/D, and the offset between two types is 16; Hence 2590 // the offset for L is 48. 2591 lea(rscratch1, 2592 ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48)); 2593 ldrq(tmp, rscratch1); 2594 // Check whether the input "shuffle" is the same with iota indices. 2595 // Return "src" if true, otherwise swap the two elements of "src". 
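// cm(EQ) sets every bit of a lane where shuffle equals the iota index; ext builds the half-swapped copy of src, and bsl then selects src for matching lanes and the swapped copy for the others.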
2596 cm(EQ, dst, size2, shuffle, tmp); 2597 ext(tmp, size1, src, src, 8); 2598 bsl(dst, size1, src, tmp); 2599 break; 2600 default: 2601 assert(false, "unsupported element type"); 2602 ShouldNotReachHere(); 2603 } 2604 } 2605 2606 // Extract a scalar element from an sve vector at position 'idx'. 2607 // The input elements in src are expected to be of integral type. 2608 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2609 int idx, FloatRegister vtmp) { 2610 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2611 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2612 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2613 if (bt == T_INT || bt == T_LONG) { 2614 umov(dst, src, size, idx); 2615 } else { 2616 smov(dst, src, size, idx); 2617 } 2618 } else { 2619 sve_orr(vtmp, src, src); 2620 sve_ext(vtmp, vtmp, idx << size); 2621 if (bt == T_INT || bt == T_LONG) { 2622 umov(dst, vtmp, size, 0); 2623 } else { 2624 smov(dst, vtmp, size, 0); 2625 } 2626 } 2627 } 2628 2629 // java.lang.Math::round intrinsics 2630 2631 // Clobbers: rscratch1, rflags 2632 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2633 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2634 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2635 switch (T) { 2636 case T2S: 2637 case T4S: 2638 fmovs(tmp1, T, 0.5f); 2639 mov(rscratch1, jint_cast(0x1.0p23f)); 2640 break; 2641 case T2D: 2642 fmovd(tmp1, T, 0.5); 2643 mov(rscratch1, julong_cast(0x1.0p52)); 2644 break; 2645 default: 2646 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2647 } 2648 fadd(tmp1, T, tmp1, src); 2649 fcvtms(tmp1, T, tmp1); 2650 // tmp1 = floor(src + 0.5, ties to even) 2651 2652 fcvtas(dst, T, src); 2653 // dst = round(src), ties to away 2654 2655 fneg(tmp3, T, src); 2656 dup(tmp2, T, rscratch1); 2657 cm(HS, tmp3, T, tmp3, tmp2); 2658 // tmp3 is now a set of flags 2659 2660 bif(dst, T16B, tmp1, tmp3); 2661 // result in dst 2662 } 2663 2664 // Clobbers: rscratch1, rflags 2665 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2666 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2667 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2668 assert_different_registers(tmp1, tmp2, src, dst); 2669 2670 switch (T) { 2671 case S: 2672 mov(rscratch1, jint_cast(0x1.0p23f)); 2673 break; 2674 case D: 2675 mov(rscratch1, julong_cast(0x1.0p52)); 2676 break; 2677 default: 2678 assert(T == S || T == D, "invalid register variant"); 2679 } 2680 2681 sve_frinta(dst, T, ptrue, src); 2682 // dst = round(src), ties to away 2683 2684 Label none; 2685 2686 sve_fneg(tmp1, T, ptrue, src); 2687 sve_dup(tmp2, T, rscratch1); 2688 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2689 br(EQ, none); 2690 { 2691 sve_cpy(tmp1, T, pgtmp, 0.5); 2692 sve_fadd(tmp1, T, pgtmp, src); 2693 sve_frintm(dst, T, pgtmp, tmp1); 2694 // dst = floor(src + 0.5, ties to even) 2695 } 2696 bind(none); 2697 2698 sve_fcvtzs(dst, T, ptrue, dst, T); 2699 // result in dst 2700 } 2701 2702 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2703 FloatRegister one, SIMD_Arrangement T) { 2704 assert_different_registers(dst, src, zero, one); 2705 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2706 2707 facgt(dst, T, src, zero); 2708 ushr(dst, T, 
dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise 2709 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst 2710 } 2711 2712 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero, 2713 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) { 2714 assert_different_registers(dst, src, zero, one, vtmp); 2715 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2716 2717 sve_orr(vtmp, src, src); 2718 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise 2719 switch (T) { 2720 case S: 2721 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src 2722 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending 2723 // on the sign of the float value 2724 break; 2725 case D: 2726 sve_and(vtmp, T, min_jlong); 2727 sve_orr(vtmp, T, jlong_cast(1.0)); 2728 break; 2729 default: 2730 assert(false, "unsupported"); 2731 ShouldNotReachHere(); 2732 } 2733 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp 2734 // Result in dst 2735 } 2736 2737 bool C2_MacroAssembler::in_scratch_emit_size() { 2738 if (ciEnv::current()->task() != nullptr) { 2739 PhaseOutput* phase_output = Compile::current()->output(); 2740 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2741 return true; 2742 } 2743 } 2744 return MacroAssembler::in_scratch_emit_size(); 2745 }