1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "opto/c2_MacroAssembler.hpp" 28 #include "opto/compile.hpp" 29 #include "opto/intrinsicnode.hpp" 30 #include "opto/matcher.hpp" 31 #include "opto/output.hpp" 32 #include "opto/subnode.hpp" 33 #include "runtime/stubRoutines.hpp" 34 #include "utilities/globalDefinitions.hpp" 35 #include "utilities/powerOfTwo.hpp" 36 37 #ifdef PRODUCT 38 #define BLOCK_COMMENT(str) /* nothing */ 39 #define STOP(error) stop(error) 40 #else 41 #define BLOCK_COMMENT(str) block_comment(str) 42 #define STOP(error) block_comment(error); stop(error) 43 #endif 44 45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 46 47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 48 49 // jdk.internal.util.ArraysSupport.vectorizedHashCode 50 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 51 FloatRegister vdata0, FloatRegister vdata1, 52 FloatRegister vdata2, FloatRegister vdata3, 53 FloatRegister vmul0, FloatRegister vmul1, 54 FloatRegister vmul2, FloatRegister vmul3, 55 FloatRegister vpow, FloatRegister vpowm, 56 BasicType eltype) { 57 ARRAYS_HASHCODE_REGISTERS; 58 59 Register tmp1 = rscratch1, tmp2 = rscratch2; 60 61 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE; 62 63 // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We 64 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to 65 // use 4H for chars and shorts instead, but using 8H gives better performance. 66 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8 67 : eltype == T_CHAR || eltype == T_SHORT ? 8 68 : eltype == T_INT ? 4 69 : 0; 70 guarantee(vf, "unsupported eltype"); 71 72 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis. 
73 const size_t unroll_factor = 4; 74 75 switch (eltype) { 76 case T_BOOLEAN: 77 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); 78 break; 79 case T_CHAR: 80 BLOCK_COMMENT("arrays_hashcode(char) {"); 81 break; 82 case T_BYTE: 83 BLOCK_COMMENT("arrays_hashcode(byte) {"); 84 break; 85 case T_SHORT: 86 BLOCK_COMMENT("arrays_hashcode(short) {"); 87 break; 88 case T_INT: 89 BLOCK_COMMENT("arrays_hashcode(int) {"); 90 break; 91 default: 92 ShouldNotReachHere(); 93 } 94 95 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop 96 // implemented by the stub executes just once. Call the stub only if at least two iterations will 97 // be executed. 98 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf; 99 cmpw(cnt, large_threshold); 100 br(Assembler::HS, LARGE); 101 102 bind(TAIL); 103 104 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past 105 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs. 106 // Iteration eats up the remainder, uf elements at a time. 107 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC"); 108 andr(tmp2, cnt, unroll_factor - 1); 109 adr(tmp1, BR_BASE); 110 // For Cortex-A53 offset is 4 because 2 nops are generated. 111 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3); 112 movw(tmp2, 0x1f); 113 br(tmp1); 114 115 bind(LOOP); 116 for (size_t i = 0; i < unroll_factor; ++i) { 117 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype); 118 maddw(result, result, tmp2, tmp1); 119 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 120 // Generate 2nd nop to have 4 instructions per iteration. 121 if (VM_Version::supports_a53mac()) { 122 nop(); 123 } 124 } 125 bind(BR_BASE); 126 subsw(cnt, cnt, unroll_factor); 127 br(Assembler::HS, LOOP); 128 129 b(DONE); 130 131 bind(LARGE); 132 133 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype)); 134 assert(stub.target() != nullptr, "array_hashcode stub has not been generated"); 135 address tpc = trampoline_call(stub); 136 if (tpc == nullptr) { 137 DEBUG_ONLY(reset_labels(TAIL, BR_BASE)); 138 postcond(pc() == badAddress); 139 return nullptr; 140 } 141 142 bind(DONE); 143 144 BLOCK_COMMENT("} // arrays_hashcode"); 145 146 postcond(pc() != badAddress); 147 return pc(); 148 } 149 150 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 151 Register tmp2Reg, Register tmp3Reg) { 152 Register oop = objectReg; 153 Register box = boxReg; 154 Register disp_hdr = tmpReg; 155 Register tmp = tmp2Reg; 156 Label cont; 157 Label object_has_monitor; 158 Label count, no_count; 159 160 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 161 assert_different_registers(oop, box, tmp, disp_hdr, rscratch2); 162 163 // Load markWord from object into displaced_header. 164 ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 165 166 if (DiagnoseSyncOnValueBasedClasses != 0) { 167 load_klass(tmp, oop); 168 ldrb(tmp, Address(tmp, Klass::misc_flags_offset())); 169 tst(tmp, KlassFlags::_misc_is_value_based_class); 170 br(Assembler::NE, cont); 171 } 172 173 // Check for existing monitor 174 tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor); 175 176 if (LockingMode == LM_MONITOR) { 177 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 
178 b(cont); 179 } else { 180 assert(LockingMode == LM_LEGACY, "must be"); 181 // Set tmp to be (markWord of object | UNLOCK_VALUE). 182 orr(tmp, disp_hdr, markWord::unlocked_value); 183 184 // Initialize the box. (Must happen before we update the object mark!) 185 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 186 187 // Compare object markWord with an unlocked value (tmp) and if 188 // equal exchange the stack address of our box with object markWord. 189 // On failure disp_hdr contains the possibly locked markWord. 190 cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true, 191 /*release*/ true, /*weak*/ false, disp_hdr); 192 br(Assembler::EQ, cont); 193 194 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 195 196 // If the compare-and-exchange succeeded, then we found an unlocked 197 // object, will have now locked it will continue at label cont 198 199 // Check if the owner is self by comparing the value in the 200 // markWord of object (disp_hdr) with the stack pointer. 201 mov(rscratch1, sp); 202 sub(disp_hdr, disp_hdr, rscratch1); 203 mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place)); 204 // If condition is true we are cont and hence we can store 0 as the 205 // displaced header in the box, which indicates that it is a recursive lock. 206 ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result 207 str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes())); 208 b(cont); 209 } 210 211 // Handle existing monitor. 212 bind(object_has_monitor); 213 214 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 215 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset())); 216 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value)); 217 cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true, 218 /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result 219 220 // Store a non-null value into the box to avoid looking like a re-entrant 221 // lock. The fast-path monitor unlock code checks for 222 // markWord::monitor_value so use markWord::unused_mark which has the 223 // relevant bit set, and also matches ObjectSynchronizer::enter. 
224 mov(tmp, (address)markWord::unused_mark().value()); 225 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 226 227 br(Assembler::EQ, cont); // CAS success means locking succeeded 228 229 cmp(tmp3Reg, rscratch2); 230 br(Assembler::NE, cont); // Check for recursive locking 231 232 // Recursive lock case 233 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1); 234 // flag == EQ still from the cmp above, checking if this is a reentrant lock 235 236 bind(cont); 237 // flag == EQ indicates success 238 // flag == NE indicates failure 239 br(Assembler::NE, no_count); 240 241 bind(count); 242 if (LockingMode == LM_LEGACY) { 243 inc_held_monitor_count(rscratch1); 244 } 245 246 bind(no_count); 247 } 248 249 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg, 250 Register tmp2Reg) { 251 Register oop = objectReg; 252 Register box = boxReg; 253 Register disp_hdr = tmpReg; 254 Register owner_addr = tmpReg; 255 Register tmp = tmp2Reg; 256 Label cont; 257 Label object_has_monitor; 258 Label count, no_count; 259 Label unlocked; 260 261 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 262 assert_different_registers(oop, box, tmp, disp_hdr); 263 264 if (LockingMode == LM_LEGACY) { 265 // Find the lock address and load the displaced header from the stack. 266 ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes())); 267 268 // If the displaced header is 0, we have a recursive unlock. 269 cmp(disp_hdr, zr); 270 br(Assembler::EQ, cont); 271 } 272 273 // Handle existing monitor. 274 ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes())); 275 tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor); 276 277 if (LockingMode == LM_MONITOR) { 278 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 279 b(cont); 280 } else { 281 assert(LockingMode == LM_LEGACY, "must be"); 282 // Check if it is still a light weight lock, this is is true if we 283 // see the stack address of the basicLock in the markWord of the 284 // object. 285 286 cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false, 287 /*release*/ true, /*weak*/ false, tmp); 288 b(cont); 289 } 290 291 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 292 293 // Handle existing monitor. 294 bind(object_has_monitor); 295 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 296 add(tmp, tmp, -(int)markWord::monitor_value); // monitor 297 298 ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 299 300 Label notRecursive; 301 cbz(disp_hdr, notRecursive); 302 303 // Recursive lock 304 sub(disp_hdr, disp_hdr, 1u); 305 str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 306 cmp(disp_hdr, disp_hdr); // Sets flags for result 307 b(cont); 308 309 bind(notRecursive); 310 311 // Compute owner address. 312 lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset())); 313 314 // Set owner to null. 315 // Release to satisfy the JMM 316 stlr(zr, owner_addr); 317 // We need a full fence after clearing owner to avoid stranding. 318 // StoreLoad achieves this. 319 membar(StoreLoad); 320 321 // Check if the entry_list is empty. 322 ldr(rscratch1, Address(tmp, ObjectMonitor::entry_list_offset())); 323 cmp(rscratch1, zr); 324 br(Assembler::EQ, cont); // If so we are done. 325 326 // Check if there is a successor. 
327 ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset())); 328 cmp(rscratch1, zr); 329 br(Assembler::NE, unlocked); // If so we are done. 330 331 // Save the monitor pointer in the current thread, so we can try to 332 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 333 str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 334 335 cmp(zr, rthread); // Set Flag to NE => slow path 336 b(cont); 337 338 bind(unlocked); 339 cmp(zr, zr); // Set Flag to EQ => fast path 340 341 // Intentional fall-through 342 343 bind(cont); 344 // flag == EQ indicates success 345 // flag == NE indicates failure 346 br(Assembler::NE, no_count); 347 348 bind(count); 349 if (LockingMode == LM_LEGACY) { 350 dec_held_monitor_count(rscratch1); 351 } 352 353 bind(no_count); 354 } 355 356 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1, 357 Register t2, Register t3) { 358 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 359 assert_different_registers(obj, box, t1, t2, t3, rscratch2); 360 361 // Handle inflated monitor. 362 Label inflated; 363 // Finish fast lock successfully. MUST branch to with flag == EQ 364 Label locked; 365 // Finish fast lock unsuccessfully. MUST branch to with flag == NE 366 Label slow_path; 367 368 if (UseObjectMonitorTable) { 369 // Clear cache in case fast locking succeeds or we need to take the slow-path. 370 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 371 } 372 373 if (DiagnoseSyncOnValueBasedClasses != 0) { 374 load_klass(t1, obj); 375 ldrb(t1, Address(t1, Klass::misc_flags_offset())); 376 tst(t1, KlassFlags::_misc_is_value_based_class); 377 br(Assembler::NE, slow_path); 378 } 379 380 const Register t1_mark = t1; 381 const Register t3_t = t3; 382 383 { // Lightweight locking 384 385 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ 386 Label push; 387 388 const Register t2_top = t2; 389 390 // Check if lock-stack is full. 391 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 392 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1); 393 br(Assembler::GT, slow_path); 394 395 // Check if recursive. 396 subw(t3_t, t2_top, oopSize); 397 ldr(t3_t, Address(rthread, t3_t)); 398 cmp(obj, t3_t); 399 br(Assembler::EQ, push); 400 401 // Relaxed normal load to check for monitor. Optimization for monitor case. 402 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 403 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 404 405 // Not inflated 406 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 407 408 // Try to lock. Transition lock-bits 0b01 => 0b00 409 orr(t1_mark, t1_mark, markWord::unlocked_value); 410 eor(t3_t, t1_mark, markWord::unlocked_value); 411 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 412 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 413 br(Assembler::NE, slow_path); 414 415 bind(push); 416 // After successful lock, push object on lock-stack. 417 str(obj, Address(rthread, t2_top)); 418 addw(t2_top, t2_top, oopSize); 419 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 420 b(locked); 421 } 422 423 { // Handle inflated monitor. 
424 bind(inflated); 425 426 const Register t1_monitor = t1; 427 428 if (!UseObjectMonitorTable) { 429 assert(t1_monitor == t1_mark, "should be the same here"); 430 } else { 431 Label monitor_found; 432 433 // Load cache address 434 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset())); 435 436 const int num_unrolled = 2; 437 for (int i = 0; i < num_unrolled; i++) { 438 ldr(t1, Address(t3_t)); 439 cmp(obj, t1); 440 br(Assembler::EQ, monitor_found); 441 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 442 } 443 444 Label loop; 445 446 // Search for obj in cache. 447 bind(loop); 448 449 // Check for match. 450 ldr(t1, Address(t3_t)); 451 cmp(obj, t1); 452 br(Assembler::EQ, monitor_found); 453 454 // Search until null encountered, guaranteed _null_sentinel at end. 455 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 456 cbnz(t1, loop); 457 // Cache Miss, NE set from cmp above, cbnz does not set flags 458 b(slow_path); 459 460 bind(monitor_found); 461 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference())); 462 } 463 464 const Register t2_owner_addr = t2; 465 const Register t3_owner = t3; 466 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 467 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag); 468 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag); 469 470 Label monitor_locked; 471 472 // Compute owner address. 473 lea(t2_owner_addr, owner_address); 474 475 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 476 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset())); 477 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true, 478 /*release*/ false, /*weak*/ false, t3_owner); 479 br(Assembler::EQ, monitor_locked); 480 481 // Check if recursive. 482 cmp(t3_owner, rscratch2); 483 br(Assembler::NE, slow_path); 484 485 // Recursive. 486 increment(recursions_address, 1); 487 488 bind(monitor_locked); 489 if (UseObjectMonitorTable) { 490 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 491 } 492 } 493 494 bind(locked); 495 496 #ifdef ASSERT 497 // Check that locked label is reached with Flags == EQ. 498 Label flag_correct; 499 br(Assembler::EQ, flag_correct); 500 stop("Fast Lock Flag != EQ"); 501 #endif 502 503 bind(slow_path); 504 #ifdef ASSERT 505 // Check that slow_path label is reached with Flags == NE. 506 br(Assembler::NE, flag_correct); 507 stop("Fast Lock Flag != NE"); 508 bind(flag_correct); 509 #endif 510 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 511 } 512 513 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1, 514 Register t2, Register t3) { 515 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 516 assert_different_registers(obj, box, t1, t2, t3); 517 518 // Handle inflated monitor. 519 Label inflated, inflated_load_mark; 520 // Finish fast unlock successfully. MUST branch to with flag == EQ 521 Label unlocked; 522 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 523 Label slow_path; 524 525 const Register t1_mark = t1; 526 const Register t2_top = t2; 527 const Register t3_t = t3; 528 529 { // Lightweight unlock 530 531 Label push_and_slow_path; 532 533 // Check if obj is top of lock-stack. 
534 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 535 subw(t2_top, t2_top, oopSize); 536 ldr(t3_t, Address(rthread, t2_top)); 537 cmp(obj, t3_t); 538 // Top of lock stack was not obj. Must be monitor. 539 br(Assembler::NE, inflated_load_mark); 540 541 // Pop lock-stack. 542 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 543 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 544 545 // Check if recursive. 546 subw(t3_t, t2_top, oopSize); 547 ldr(t3_t, Address(rthread, t3_t)); 548 cmp(obj, t3_t); 549 br(Assembler::EQ, unlocked); 550 551 // Not recursive. 552 // Load Mark. 553 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 554 555 // Check header for monitor (0b10). 556 // Because we got here by popping (meaning we pushed in locked) 557 // there will be no monitor in the box. So we need to push back the obj 558 // so that the runtime can fix any potential anonymous owner. 559 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated); 560 561 // Try to unlock. Transition lock bits 0b00 => 0b01 562 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 563 orr(t3_t, t1_mark, markWord::unlocked_value); 564 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 565 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 566 br(Assembler::EQ, unlocked); 567 568 bind(push_and_slow_path); 569 // Compare and exchange failed. 570 // Restore lock-stack and handle the unlock in runtime. 571 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 572 addw(t2_top, t2_top, oopSize); 573 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 574 b(slow_path); 575 } 576 577 578 { // Handle inflated monitor. 579 bind(inflated_load_mark); 580 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 581 #ifdef ASSERT 582 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 583 stop("Fast Unlock not monitor"); 584 #endif 585 586 bind(inflated); 587 588 #ifdef ASSERT 589 Label check_done; 590 subw(t2_top, t2_top, oopSize); 591 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 592 br(Assembler::LT, check_done); 593 ldr(t3_t, Address(rthread, t2_top)); 594 cmp(obj, t3_t); 595 br(Assembler::NE, inflated); 596 stop("Fast Unlock lock on stack"); 597 bind(check_done); 598 #endif 599 600 const Register t1_monitor = t1; 601 602 if (!UseObjectMonitorTable) { 603 assert(t1_monitor == t1_mark, "should be the same here"); 604 605 // Untag the monitor. 606 add(t1_monitor, t1_mark, -(int)markWord::monitor_value); 607 } else { 608 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 609 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 610 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 611 br(Assembler::LO, slow_path); 612 } 613 614 const Register t2_recursions = t2; 615 Label not_recursive; 616 617 // Check if recursive. 618 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 619 cbz(t2_recursions, not_recursive); 620 621 // Recursive unlock. 622 sub(t2_recursions, t2_recursions, 1u); 623 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 624 // Set flag == EQ 625 cmp(t2_recursions, t2_recursions); 626 b(unlocked); 627 628 bind(not_recursive); 629 630 const Register t2_owner_addr = t2; 631 632 // Compute owner address. 633 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 634 635 // Set owner to null. 
636 // Release to satisfy the JMM 637 stlr(zr, t2_owner_addr); 638 // We need a full fence after clearing owner to avoid stranding. 639 // StoreLoad achieves this. 640 membar(StoreLoad); 641 642 // Check if the entry_list is empty. 643 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset())); 644 cmp(rscratch1, zr); 645 br(Assembler::EQ, unlocked); // If so we are done. 646 647 // Check if there is a successor. 648 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset())); 649 cmp(rscratch1, zr); 650 br(Assembler::NE, unlocked); // If so we are done. 651 652 // Save the monitor pointer in the current thread, so we can try to 653 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 654 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 655 656 cmp(zr, rthread); // Set Flag to NE => slow path 657 b(slow_path); 658 } 659 660 bind(unlocked); 661 cmp(zr, zr); // Set Flags to EQ => fast path 662 663 #ifdef ASSERT 664 // Check that unlocked label is reached with Flags == EQ. 665 Label flag_correct; 666 br(Assembler::EQ, flag_correct); 667 stop("Fast Unlock Flag != EQ"); 668 #endif 669 670 bind(slow_path); 671 #ifdef ASSERT 672 // Check that slow_path label is reached with Flags == NE. 673 br(Assembler::NE, flag_correct); 674 stop("Fast Unlock Flag != NE"); 675 bind(flag_correct); 676 #endif 677 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 678 } 679 680 // Search for str1 in str2 and return index or -1 681 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 682 void C2_MacroAssembler::string_indexof(Register str2, Register str1, 683 Register cnt2, Register cnt1, 684 Register tmp1, Register tmp2, 685 Register tmp3, Register tmp4, 686 Register tmp5, Register tmp6, 687 int icnt1, Register result, int ae) { 688 // NOTE: tmp5, tmp6 can be zr depending on specific method version 689 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 690 691 Register ch1 = rscratch1; 692 Register ch2 = rscratch2; 693 Register cnt1tmp = tmp1; 694 Register cnt2tmp = tmp2; 695 Register cnt1_neg = cnt1; 696 Register cnt2_neg = cnt2; 697 Register result_tmp = tmp4; 698 699 bool isL = ae == StrIntrinsicNode::LL; 700 701 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 702 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 703 int str1_chr_shift = str1_isL ? 0:1; 704 int str2_chr_shift = str2_isL ? 0:1; 705 int str1_chr_size = str1_isL ? 1:2; 706 int str2_chr_size = str2_isL ? 1:2; 707 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 708 (chr_insn)&MacroAssembler::ldrh; 709 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 710 (chr_insn)&MacroAssembler::ldrh; 711 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 712 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 713 714 // Note, inline_string_indexOf() generates checks: 715 // if (substr.count > string.count) return -1; 716 // if (substr.count == 0) return 0; 717 718 // We have two strings, a source string in str2, cnt2 and a pattern string 719 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1. 720 721 // For larger pattern and source we use a simplified Boyer Moore algorithm. 722 // With a small pattern and source we use linear scan. 
723 724 if (icnt1 == -1) { 725 sub(result_tmp, cnt2, cnt1); 726 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 727 br(LT, LINEARSEARCH); 728 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty 729 subs(zr, cnt1, 256); 730 lsr(tmp1, cnt2, 2); 731 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 732 br(GE, LINEARSTUB); 733 } 734 735 // The Boyer Moore alogorithm is based on the description here:- 736 // 737 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 738 // 739 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 740 // and the 'Good Suffix' rule. 741 // 742 // These rules are essentially heuristics for how far we can shift the 743 // pattern along the search string. 744 // 745 // The implementation here uses the 'Bad Character' rule only because of the 746 // complexity of initialisation for the 'Good Suffix' rule. 747 // 748 // This is also known as the Boyer-Moore-Horspool algorithm:- 749 // 750 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 751 // 752 // This particular implementation has few java-specific optimizations. 753 // 754 // #define ASIZE 256 755 // 756 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 757 // int i, j; 758 // unsigned c; 759 // unsigned char bc[ASIZE]; 760 // 761 // /* Preprocessing */ 762 // for (i = 0; i < ASIZE; ++i) 763 // bc[i] = m; 764 // for (i = 0; i < m - 1; ) { 765 // c = x[i]; 766 // ++i; 767 // // c < 256 for Latin1 string, so, no need for branch 768 // #ifdef PATTERN_STRING_IS_LATIN1 769 // bc[c] = m - i; 770 // #else 771 // if (c < ASIZE) bc[c] = m - i; 772 // #endif 773 // } 774 // 775 // /* Searching */ 776 // j = 0; 777 // while (j <= n - m) { 778 // c = y[i+j]; 779 // if (x[m-1] == c) 780 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 781 // if (i < 0) return j; 782 // // c < 256 for Latin1 string, so, no need for branch 783 // #ifdef SOURCE_STRING_IS_LATIN1 784 // // LL case: (c< 256) always true. Remove branch 785 // j += bc[y[j+m-1]]; 786 // #endif 787 // #ifndef PATTERN_STRING_IS_UTF 788 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 789 // if (c < ASIZE) 790 // j += bc[y[j+m-1]]; 791 // else 792 // j += 1 793 // #endif 794 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 795 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 796 // if (c < ASIZE) 797 // j += bc[y[j+m-1]]; 798 // else 799 // j += m 800 // #endif 801 // } 802 // } 803 804 if (icnt1 == -1) { 805 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 806 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 807 Register cnt1end = tmp2; 808 Register str2end = cnt2; 809 Register skipch = tmp2; 810 811 // str1 length is >=8, so, we can read at least 1 register for cases when 812 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 813 // UL case. We'll re-read last character in inner pre-loop code to have 814 // single outer pre-loop load 815 const int firstStep = isL ? 
7 : 3; 816 817 const int ASIZE = 256; 818 const int STORED_BYTES = 32; // amount of bytes stored per instruction 819 sub(sp, sp, ASIZE); 820 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 821 mov(ch1, sp); 822 BIND(BM_INIT_LOOP); 823 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 824 subs(tmp5, tmp5, 1); 825 br(GT, BM_INIT_LOOP); 826 827 sub(cnt1tmp, cnt1, 1); 828 mov(tmp5, str2); 829 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 830 sub(ch2, cnt1, 1); 831 mov(tmp3, str1); 832 BIND(BCLOOP); 833 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 834 if (!str1_isL) { 835 subs(zr, ch1, ASIZE); 836 br(HS, BCSKIP); 837 } 838 strb(ch2, Address(sp, ch1)); 839 BIND(BCSKIP); 840 subs(ch2, ch2, 1); 841 br(GT, BCLOOP); 842 843 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 844 if (str1_isL == str2_isL) { 845 // load last 8 bytes (8LL/4UU symbols) 846 ldr(tmp6, Address(tmp6, -wordSize)); 847 } else { 848 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 849 // convert Latin1 to UTF. We'll have to wait until load completed, but 850 // it's still faster than per-character loads+checks 851 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 852 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 853 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 854 andr(tmp6, tmp6, 0xFF); // str1[N-4] 855 orr(ch2, ch1, ch2, LSL, 16); 856 orr(tmp6, tmp6, tmp3, LSL, 48); 857 orr(tmp6, tmp6, ch2, LSL, 16); 858 } 859 BIND(BMLOOPSTR2); 860 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 861 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 862 if (str1_isL == str2_isL) { 863 // re-init tmp3. It's for free because it's executed in parallel with 864 // load above. Alternative is to initialize it before loop, but it'll 865 // affect performance on in-order systems with 2 or more ld/st pipelines 866 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 867 } 868 if (!isL) { // UU/UL case 869 lsl(ch2, cnt1tmp, 1); // offset in bytes 870 } 871 cmp(tmp3, skipch); 872 br(NE, BMSKIP); 873 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 874 mov(ch1, tmp6); 875 if (isL) { 876 b(BMLOOPSTR1_AFTER_LOAD); 877 } else { 878 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 879 b(BMLOOPSTR1_CMP); 880 } 881 BIND(BMLOOPSTR1); 882 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 883 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 884 BIND(BMLOOPSTR1_AFTER_LOAD); 885 subs(cnt1tmp, cnt1tmp, 1); 886 br(LT, BMLOOPSTR1_LASTCMP); 887 BIND(BMLOOPSTR1_CMP); 888 cmp(ch1, ch2); 889 br(EQ, BMLOOPSTR1); 890 BIND(BMSKIP); 891 if (!isL) { 892 // if we've met UTF symbol while searching Latin1 pattern, then we can 893 // skip cnt1 symbols 894 if (str1_isL != str2_isL) { 895 mov(result_tmp, cnt1); 896 } else { 897 mov(result_tmp, 1); 898 } 899 subs(zr, skipch, ASIZE); 900 br(HS, BMADV); 901 } 902 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 903 BIND(BMADV); 904 sub(cnt1tmp, cnt1, 1); 905 add(str2, str2, result_tmp, LSL, str2_chr_shift); 906 cmp(str2, str2end); 907 br(LE, BMLOOPSTR2); 908 add(sp, sp, ASIZE); 909 b(NOMATCH); 910 BIND(BMLOOPSTR1_LASTCMP); 911 cmp(ch1, ch2); 912 br(NE, BMSKIP); 913 BIND(BMMATCH); 914 sub(result, str2, tmp5); 915 if (!str2_isL) lsr(result, result, 1); 916 add(sp, sp, ASIZE); 917 b(DONE); 918 919 BIND(LINEARSTUB); 920 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 921 br(LT, LINEAR_MEDIUM); 922 mov(result, zr); 923 RuntimeAddress stub = nullptr; 924 if (isL) { 925 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 926 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 927 } else if (str1_isL) { 928 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 929 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 930 } else { 931 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 932 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 933 } 934 address call = trampoline_call(stub); 935 if (call == nullptr) { 936 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 937 ciEnv::current()->record_failure("CodeCache is full"); 938 return; 939 } 940 b(DONE); 941 } 942 943 BIND(LINEARSEARCH); 944 { 945 Label DO1, DO2, DO3; 946 947 Register str2tmp = tmp2; 948 Register first = tmp3; 949 950 if (icnt1 == -1) 951 { 952 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 953 954 cmp(cnt1, u1(str1_isL == str2_isL ? 
4 : 2)); 955 br(LT, DOSHORT); 956 BIND(LINEAR_MEDIUM); 957 (this->*str1_load_1chr)(first, Address(str1)); 958 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 959 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 960 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 961 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 962 963 BIND(FIRST_LOOP); 964 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 965 cmp(first, ch2); 966 br(EQ, STR1_LOOP); 967 BIND(STR2_NEXT); 968 adds(cnt2_neg, cnt2_neg, str2_chr_size); 969 br(LE, FIRST_LOOP); 970 b(NOMATCH); 971 972 BIND(STR1_LOOP); 973 adds(cnt1tmp, cnt1_neg, str1_chr_size); 974 add(cnt2tmp, cnt2_neg, str2_chr_size); 975 br(GE, MATCH); 976 977 BIND(STR1_NEXT); 978 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 979 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 980 cmp(ch1, ch2); 981 br(NE, STR2_NEXT); 982 adds(cnt1tmp, cnt1tmp, str1_chr_size); 983 add(cnt2tmp, cnt2tmp, str2_chr_size); 984 br(LT, STR1_NEXT); 985 b(MATCH); 986 987 BIND(DOSHORT); 988 if (str1_isL == str2_isL) { 989 cmp(cnt1, (u1)2); 990 br(LT, DO1); 991 br(GT, DO3); 992 } 993 } 994 995 if (icnt1 == 4) { 996 Label CH1_LOOP; 997 998 (this->*load_4chr)(ch1, str1); 999 sub(result_tmp, cnt2, 4); 1000 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1001 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1002 1003 BIND(CH1_LOOP); 1004 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 1005 cmp(ch1, ch2); 1006 br(EQ, MATCH); 1007 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1008 br(LE, CH1_LOOP); 1009 b(NOMATCH); 1010 } 1011 1012 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 1013 Label CH1_LOOP; 1014 1015 BIND(DO2); 1016 (this->*load_2chr)(ch1, str1); 1017 if (icnt1 == 2) { 1018 sub(result_tmp, cnt2, 2); 1019 } 1020 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1021 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1022 BIND(CH1_LOOP); 1023 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1024 cmp(ch1, ch2); 1025 br(EQ, MATCH); 1026 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1027 br(LE, CH1_LOOP); 1028 b(NOMATCH); 1029 } 1030 1031 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 1032 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 1033 1034 BIND(DO3); 1035 (this->*load_2chr)(first, str1); 1036 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 1037 if (icnt1 == 3) { 1038 sub(result_tmp, cnt2, 3); 1039 } 1040 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1041 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1042 BIND(FIRST_LOOP); 1043 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1044 cmpw(first, ch2); 1045 br(EQ, STR1_LOOP); 1046 BIND(STR2_NEXT); 1047 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1048 br(LE, FIRST_LOOP); 1049 b(NOMATCH); 1050 1051 BIND(STR1_LOOP); 1052 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 1053 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 1054 cmp(ch1, ch2); 1055 br(NE, STR2_NEXT); 1056 b(MATCH); 1057 } 1058 1059 if (icnt1 == -1 || icnt1 == 1) { 1060 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 1061 1062 BIND(DO1); 1063 (this->*str1_load_1chr)(ch1, str1); 1064 cmp(cnt2, (u1)8); 1065 br(LT, DO1_SHORT); 1066 1067 sub(result_tmp, cnt2, 8/str2_chr_size); 1068 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1069 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 1070 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1071 1072 if (str2_isL) { 1073 orr(ch1, ch1, ch1, LSL, 8); 1074 } 1075 orr(ch1, ch1, ch1, LSL, 16); 1076 orr(ch1, ch1, ch1, LSL, 32); 1077 BIND(CH1_LOOP); 1078 ldr(ch2, Address(str2, cnt2_neg)); 1079 eor(ch2, ch1, ch2); 1080 sub(tmp1, ch2, tmp3); 1081 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 1082 bics(tmp1, tmp1, tmp2); 1083 br(NE, HAS_ZERO); 1084 adds(cnt2_neg, cnt2_neg, 8); 1085 br(LT, CH1_LOOP); 1086 1087 cmp(cnt2_neg, (u1)8); 1088 mov(cnt2_neg, 0); 1089 br(LT, CH1_LOOP); 1090 b(NOMATCH); 1091 1092 BIND(HAS_ZERO); 1093 rev(tmp1, tmp1); 1094 clz(tmp1, tmp1); 1095 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 1096 b(MATCH); 1097 1098 BIND(DO1_SHORT); 1099 mov(result_tmp, cnt2); 1100 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 1101 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 1102 BIND(DO1_LOOP); 1103 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 1104 cmpw(ch1, ch2); 1105 br(EQ, MATCH); 1106 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1107 br(LT, DO1_LOOP); 1108 } 1109 } 1110 BIND(NOMATCH); 1111 mov(result, -1); 1112 b(DONE); 1113 BIND(MATCH); 1114 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 1115 BIND(DONE); 1116 } 1117 1118 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 1119 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 1120 1121 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 1122 Register ch, Register result, 1123 Register tmp1, Register tmp2, Register tmp3) 1124 { 1125 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1126 Register cnt1_neg = cnt1; 1127 Register ch1 = rscratch1; 1128 Register result_tmp = rscratch2; 1129 1130 cbz(cnt1, NOMATCH); 1131 1132 cmp(cnt1, (u1)4); 1133 br(LT, DO1_SHORT); 1134 1135 orr(ch, ch, ch, LSL, 16); 1136 orr(ch, ch, ch, LSL, 32); 1137 1138 sub(cnt1, cnt1, 4); 1139 mov(result_tmp, cnt1); 1140 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1141 sub(cnt1_neg, zr, cnt1, LSL, 1); 1142 1143 mov(tmp3, 0x0001000100010001); 1144 1145 BIND(CH1_LOOP); 1146 ldr(ch1, Address(str1, cnt1_neg)); 1147 eor(ch1, ch, ch1); 1148 sub(tmp1, ch1, tmp3); 1149 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 1150 bics(tmp1, tmp1, tmp2); 1151 br(NE, HAS_ZERO); 1152 adds(cnt1_neg, cnt1_neg, 8); 1153 br(LT, CH1_LOOP); 1154 1155 cmp(cnt1_neg, (u1)8); 1156 mov(cnt1_neg, 0); 1157 br(LT, CH1_LOOP); 1158 b(NOMATCH); 1159 1160 BIND(HAS_ZERO); 1161 rev(tmp1, tmp1); 1162 clz(tmp1, tmp1); 1163 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1164 b(MATCH); 1165 1166 BIND(DO1_SHORT); 1167 mov(result_tmp, cnt1); 1168 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1169 sub(cnt1_neg, zr, cnt1, LSL, 1); 1170 BIND(DO1_LOOP); 1171 ldrh(ch1, Address(str1, cnt1_neg)); 1172 cmpw(ch, ch1); 1173 br(EQ, MATCH); 1174 adds(cnt1_neg, cnt1_neg, 2); 1175 br(LT, DO1_LOOP); 1176 BIND(NOMATCH); 1177 mov(result, -1); 1178 b(DONE); 1179 BIND(MATCH); 1180 add(result, result_tmp, cnt1_neg, ASR, 1); 1181 BIND(DONE); 1182 } 1183 1184 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 1185 Register ch, Register result, 1186 FloatRegister ztmp1, 1187 FloatRegister ztmp2, 1188 PRegister tmp_pg, 1189 PRegister tmp_pdn, bool isL) 1190 { 1191 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 
1192 assert(tmp_pg->is_governing(), 1193 "this register has to be a governing predicate register"); 1194 1195 Label LOOP, MATCH, DONE, NOMATCH; 1196 Register vec_len = rscratch1; 1197 Register idx = rscratch2; 1198 1199 SIMD_RegVariant T = (isL == true) ? B : H; 1200 1201 cbz(cnt1, NOMATCH); 1202 1203 // Assign the particular char throughout the vector. 1204 sve_dup(ztmp2, T, ch); 1205 if (isL) { 1206 sve_cntb(vec_len); 1207 } else { 1208 sve_cnth(vec_len); 1209 } 1210 mov(idx, 0); 1211 1212 // Generate a predicate to control the reading of input string. 1213 sve_whilelt(tmp_pg, T, idx, cnt1); 1214 1215 BIND(LOOP); 1216 // Read a vector of 8- or 16-bit data depending on the string type. Note 1217 // that inactive elements indicated by the predicate register won't cause 1218 // a data read from memory to the destination vector. 1219 if (isL) { 1220 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1221 } else { 1222 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1223 } 1224 add(idx, idx, vec_len); 1225 1226 // Perform the comparison. An element of the destination predicate is set 1227 // to active if the particular char is matched. 1228 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1229 1230 // Branch if the particular char is found. 1231 br(NE, MATCH); 1232 1233 sve_whilelt(tmp_pg, T, idx, cnt1); 1234 1235 // Loop back if the particular char not found. 1236 br(MI, LOOP); 1237 1238 BIND(NOMATCH); 1239 mov(result, -1); 1240 b(DONE); 1241 1242 BIND(MATCH); 1243 // Undo the index increment. 1244 sub(idx, idx, vec_len); 1245 1246 // Crop the vector to find its location. 1247 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1248 add(result, idx, -1); 1249 sve_incp(result, T, tmp_pdn); 1250 BIND(DONE); 1251 } 1252 1253 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1254 Register ch, Register result, 1255 Register tmp1, Register tmp2, Register tmp3) 1256 { 1257 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1258 Register cnt1_neg = cnt1; 1259 Register ch1 = rscratch1; 1260 Register result_tmp = rscratch2; 1261 1262 cbz(cnt1, NOMATCH); 1263 1264 cmp(cnt1, (u1)8); 1265 br(LT, DO1_SHORT); 1266 1267 orr(ch, ch, ch, LSL, 8); 1268 orr(ch, ch, ch, LSL, 16); 1269 orr(ch, ch, ch, LSL, 32); 1270 1271 sub(cnt1, cnt1, 8); 1272 mov(result_tmp, cnt1); 1273 lea(str1, Address(str1, cnt1)); 1274 sub(cnt1_neg, zr, cnt1); 1275 1276 mov(tmp3, 0x0101010101010101); 1277 1278 BIND(CH1_LOOP); 1279 ldr(ch1, Address(str1, cnt1_neg)); 1280 eor(ch1, ch, ch1); 1281 sub(tmp1, ch1, tmp3); 1282 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1283 bics(tmp1, tmp1, tmp2); 1284 br(NE, HAS_ZERO); 1285 adds(cnt1_neg, cnt1_neg, 8); 1286 br(LT, CH1_LOOP); 1287 1288 cmp(cnt1_neg, (u1)8); 1289 mov(cnt1_neg, 0); 1290 br(LT, CH1_LOOP); 1291 b(NOMATCH); 1292 1293 BIND(HAS_ZERO); 1294 rev(tmp1, tmp1); 1295 clz(tmp1, tmp1); 1296 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1297 b(MATCH); 1298 1299 BIND(DO1_SHORT); 1300 mov(result_tmp, cnt1); 1301 lea(str1, Address(str1, cnt1)); 1302 sub(cnt1_neg, zr, cnt1); 1303 BIND(DO1_LOOP); 1304 ldrb(ch1, Address(str1, cnt1_neg)); 1305 cmp(ch, ch1); 1306 br(EQ, MATCH); 1307 adds(cnt1_neg, cnt1_neg, 1); 1308 br(LT, DO1_LOOP); 1309 BIND(NOMATCH); 1310 mov(result, -1); 1311 b(DONE); 1312 BIND(MATCH); 1313 add(result, result_tmp, cnt1_neg); 1314 BIND(DONE); 1315 } 1316 1317 // Compare strings. 
1318 void C2_MacroAssembler::string_compare(Register str1, Register str2, 1319 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, 1320 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, 1321 PRegister pgtmp1, PRegister pgtmp2, int ae) { 1322 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 1323 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 1324 SHORT_LOOP_START, TAIL_CHECK; 1325 1326 bool isLL = ae == StrIntrinsicNode::LL; 1327 bool isLU = ae == StrIntrinsicNode::LU; 1328 bool isUL = ae == StrIntrinsicNode::UL; 1329 1330 // The stub threshold for LL strings is: 72 (64 + 8) chars 1331 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch) 1332 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least) 1333 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36); 1334 1335 bool str1_isL = isLL || isLU; 1336 bool str2_isL = isLL || isUL; 1337 1338 int str1_chr_shift = str1_isL ? 0 : 1; 1339 int str2_chr_shift = str2_isL ? 0 : 1; 1340 int str1_chr_size = str1_isL ? 1 : 2; 1341 int str2_chr_size = str2_isL ? 1 : 2; 1342 int minCharsInWord = isLL ? wordSize : wordSize/2; 1343 1344 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 1345 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 1346 (chr_insn)&MacroAssembler::ldrh; 1347 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 1348 (chr_insn)&MacroAssembler::ldrh; 1349 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 1350 (uxt_insn)&MacroAssembler::uxthw; 1351 1352 BLOCK_COMMENT("string_compare {"); 1353 1354 // Bizarrely, the counts are passed in bytes, regardless of whether they 1355 // are L or U strings, however the result is always in characters. 1356 if (!str1_isL) asrw(cnt1, cnt1, 1); 1357 if (!str2_isL) asrw(cnt2, cnt2, 1); 1358 1359 // Compute the minimum of the string lengths and save the difference. 
1360 subsw(result, cnt1, cnt2); 1361 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1362 1363 // A very short string 1364 cmpw(cnt2, minCharsInWord); 1365 br(Assembler::LE, SHORT_STRING); 1366 1367 // Compare longwords 1368 // load first parts of strings and finish initialization while loading 1369 { 1370 if (str1_isL == str2_isL) { // LL or UU 1371 ldr(tmp1, Address(str1)); 1372 cmp(str1, str2); 1373 br(Assembler::EQ, DONE); 1374 ldr(tmp2, Address(str2)); 1375 cmp(cnt2, stub_threshold); 1376 br(GE, STUB); 1377 subsw(cnt2, cnt2, minCharsInWord); 1378 br(EQ, TAIL_CHECK); 1379 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1380 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1381 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1382 } else if (isLU) { 1383 ldrs(vtmp, Address(str1)); 1384 ldr(tmp2, Address(str2)); 1385 cmp(cnt2, stub_threshold); 1386 br(GE, STUB); 1387 subw(cnt2, cnt2, 4); 1388 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1389 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1390 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1391 zip1(vtmp, T8B, vtmp, vtmpZ); 1392 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1393 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1394 add(cnt1, cnt1, 4); 1395 fmovd(tmp1, vtmp); 1396 } else { // UL case 1397 ldr(tmp1, Address(str1)); 1398 ldrs(vtmp, Address(str2)); 1399 cmp(cnt2, stub_threshold); 1400 br(GE, STUB); 1401 subw(cnt2, cnt2, 4); 1402 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1403 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1404 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1405 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1406 zip1(vtmp, T8B, vtmp, vtmpZ); 1407 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1408 add(cnt1, cnt1, 8); 1409 fmovd(tmp2, vtmp); 1410 } 1411 adds(cnt2, cnt2, isUL ? 4 : 8); 1412 br(GE, TAIL); 1413 eor(rscratch2, tmp1, tmp2); 1414 cbnz(rscratch2, DIFF); 1415 // main loop 1416 bind(NEXT_WORD); 1417 if (str1_isL == str2_isL) { 1418 ldr(tmp1, Address(str1, cnt2)); 1419 ldr(tmp2, Address(str2, cnt2)); 1420 adds(cnt2, cnt2, 8); 1421 } else if (isLU) { 1422 ldrs(vtmp, Address(str1, cnt1)); 1423 ldr(tmp2, Address(str2, cnt2)); 1424 add(cnt1, cnt1, 4); 1425 zip1(vtmp, T8B, vtmp, vtmpZ); 1426 fmovd(tmp1, vtmp); 1427 adds(cnt2, cnt2, 8); 1428 } else { // UL 1429 ldrs(vtmp, Address(str2, cnt2)); 1430 ldr(tmp1, Address(str1, cnt1)); 1431 zip1(vtmp, T8B, vtmp, vtmpZ); 1432 add(cnt1, cnt1, 8); 1433 fmovd(tmp2, vtmp); 1434 adds(cnt2, cnt2, 4); 1435 } 1436 br(GE, TAIL); 1437 1438 eor(rscratch2, tmp1, tmp2); 1439 cbz(rscratch2, NEXT_WORD); 1440 b(DIFF); 1441 bind(TAIL); 1442 eor(rscratch2, tmp1, tmp2); 1443 cbnz(rscratch2, DIFF); 1444 // Last longword. In the case where length == 4 we compare the 1445 // same longword twice, but that's still faster than another 1446 // conditional branch. 1447 if (str1_isL == str2_isL) { 1448 ldr(tmp1, Address(str1)); 1449 ldr(tmp2, Address(str2)); 1450 } else if (isLU) { 1451 ldrs(vtmp, Address(str1)); 1452 ldr(tmp2, Address(str2)); 1453 zip1(vtmp, T8B, vtmp, vtmpZ); 1454 fmovd(tmp1, vtmp); 1455 } else { // UL 1456 ldrs(vtmp, Address(str2)); 1457 ldr(tmp1, Address(str1)); 1458 zip1(vtmp, T8B, vtmp, vtmpZ); 1459 fmovd(tmp2, vtmp); 1460 } 1461 bind(TAIL_CHECK); 1462 eor(rscratch2, tmp1, tmp2); 1463 cbz(rscratch2, DONE); 1464 1465 // Find the first different characters in the longwords and 1466 // compute their difference. 1467 bind(DIFF); 1468 rev(rscratch2, rscratch2); 1469 clz(rscratch2, rscratch2); 1470 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1471 lsrv(tmp1, tmp1, rscratch2); 1472 (this->*ext_chr)(tmp1, tmp1); 1473 lsrv(tmp2, tmp2, rscratch2); 1474 (this->*ext_chr)(tmp2, tmp2); 1475 subw(result, tmp1, tmp2); 1476 b(DONE); 1477 } 1478 1479 bind(STUB); 1480 RuntimeAddress stub = nullptr; 1481 switch(ae) { 1482 case StrIntrinsicNode::LL: 1483 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1484 break; 1485 case StrIntrinsicNode::UU: 1486 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1487 break; 1488 case StrIntrinsicNode::LU: 1489 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1490 break; 1491 case StrIntrinsicNode::UL: 1492 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1493 break; 1494 default: 1495 ShouldNotReachHere(); 1496 } 1497 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1498 address call = trampoline_call(stub); 1499 if (call == nullptr) { 1500 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1501 ciEnv::current()->record_failure("CodeCache is full"); 1502 return; 1503 } 1504 b(DONE); 1505 1506 bind(SHORT_STRING); 1507 // Is the minimum length zero? 1508 cbz(cnt2, DONE); 1509 // arrange code to do most branches while loading and loading next characters 1510 // while comparing previous 1511 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1512 subs(cnt2, cnt2, 1); 1513 br(EQ, SHORT_LAST_INIT); 1514 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1515 b(SHORT_LOOP_START); 1516 bind(SHORT_LOOP); 1517 subs(cnt2, cnt2, 1); 1518 br(EQ, SHORT_LAST); 1519 bind(SHORT_LOOP_START); 1520 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1521 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1522 cmp(tmp1, cnt1); 1523 br(NE, SHORT_LOOP_TAIL); 1524 subs(cnt2, cnt2, 1); 1525 br(EQ, SHORT_LAST2); 1526 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1527 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1528 cmp(tmp2, rscratch1); 1529 br(EQ, SHORT_LOOP); 1530 sub(result, tmp2, rscratch1); 1531 b(DONE); 1532 bind(SHORT_LOOP_TAIL); 1533 sub(result, tmp1, cnt1); 1534 b(DONE); 1535 bind(SHORT_LAST2); 1536 cmp(tmp2, rscratch1); 1537 br(EQ, DONE); 1538 sub(result, tmp2, rscratch1); 1539 1540 b(DONE); 1541 bind(SHORT_LAST_INIT); 1542 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1543 bind(SHORT_LAST); 1544 cmp(tmp1, cnt1); 1545 br(EQ, DONE); 1546 sub(result, tmp1, cnt1); 1547 1548 bind(DONE); 1549 1550 BLOCK_COMMENT("} string_compare"); 1551 } 1552 1553 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1554 FloatRegister src2, Condition cond, bool isQ) { 1555 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1556 FloatRegister zn = src1, zm = src2; 1557 bool needs_negation = false; 1558 switch (cond) { 1559 case LT: cond = GT; zn = src2; zm = src1; break; 1560 case LE: cond = GE; zn = src2; zm = src1; break; 1561 case LO: cond = HI; zn = src2; zm = src1; break; 1562 case LS: cond = HS; zn = src2; zm = src1; break; 1563 case NE: cond = EQ; needs_negation = true; break; 1564 default: 1565 break; 1566 } 1567 1568 if (is_floating_point_type(bt)) { 1569 fcm(cond, dst, size, zn, zm); 1570 } else { 1571 cm(cond, dst, size, zn, zm); 1572 } 1573 1574 if (needs_negation) { 1575 notr(dst, isQ ? 
T16B : T8B, dst); 1576 } 1577 } 1578 1579 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1580 Condition cond, bool isQ) { 1581 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1582 if (bt == T_FLOAT || bt == T_DOUBLE) { 1583 if (cond == Assembler::NE) { 1584 fcm(Assembler::EQ, dst, size, src); 1585 notr(dst, isQ ? T16B : T8B, dst); 1586 } else { 1587 fcm(cond, dst, size, src); 1588 } 1589 } else { 1590 if (cond == Assembler::NE) { 1591 cm(Assembler::EQ, dst, size, src); 1592 notr(dst, isQ ? T16B : T8B, dst); 1593 } else { 1594 cm(cond, dst, size, src); 1595 } 1596 } 1597 } 1598 1599 // Compress the least significant bit of each byte to the rightmost and clear 1600 // the higher garbage bits. 1601 void C2_MacroAssembler::bytemask_compress(Register dst) { 1602 // Example input, dst = 0x01 00 00 00 01 01 00 01 1603 // The "??" bytes are garbage. 1604 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1605 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1606 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1607 andr(dst, dst, 0xff); // dst = 0x8D 1608 } 1609 1610 // Pack the lowest-numbered bit of each mask element in src into a long value 1611 // in dst, at most the first 64 lane elements. 1612 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1613 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1614 FloatRegister vtmp1, FloatRegister vtmp2) { 1615 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1616 assert_different_registers(dst, rscratch1); 1617 assert_different_registers(vtmp1, vtmp2); 1618 1619 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1620 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1621 // Expected: dst = 0x658D 1622 1623 // Convert the mask into vector with sequential bytes. 1624 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1625 sve_cpy(vtmp1, size, src, 1, false); 1626 if (bt != T_BYTE) { 1627 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1628 } 1629 1630 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1631 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1632 // is to compress each significant bit of the byte in a cross-lane way. Due 1633 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1634 // (bit-compress in each lane) with the biggest lane size (T = D) then 1635 // concatenate the results. 1636 1637 // The second source input of BEXT, initialized with 0x01 in each byte. 1638 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1639 sve_dup(vtmp2, B, 1); 1640 1641 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1642 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1643 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1644 // --------------------------------------- 1645 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1646 sve_bext(vtmp1, D, vtmp1, vtmp2); 1647 1648 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1649 // result to dst. 1650 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1651 // dst = 0x658D 1652 if (lane_cnt <= 8) { 1653 // No need to concatenate. 
1654 umov(dst, vtmp1, B, 0); 1655 } else if (lane_cnt <= 16) { 1656 ins(vtmp1, B, vtmp1, 1, 8); 1657 umov(dst, vtmp1, H, 0); 1658 } else { 1659 // As the lane count is 64 at most, the final expected value must be in 1660 // the lowest 64 bits after narrowing vtmp1 from D to B. 1661 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2); 1662 umov(dst, vtmp1, D, 0); 1663 } 1664 } else if (UseSVE > 0) { 1665 // Compress the lowest 8 bytes. 1666 fmovd(dst, vtmp1); 1667 bytemask_compress(dst); 1668 if (lane_cnt <= 8) return; 1669 1670 // Repeat on higher bytes and join the results. 1671 // Compress 8 bytes in each iteration. 1672 for (int idx = 1; idx < (lane_cnt / 8); idx++) { 1673 sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2); 1674 bytemask_compress(rscratch1); 1675 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); 1676 } 1677 } else { 1678 assert(false, "unsupported"); 1679 ShouldNotReachHere(); 1680 } 1681 } 1682 1683 // Unpack the mask, a long value in src, into predicate register dst based on the 1684 // corresponding data type. Note that dst can support at most 64 lanes. 1685 // Below example gives the expected dst predicate register in different types, with 1686 // a valid src(0x658D) on a 1024-bit vector size machine. 1687 // BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D 1688 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51 1689 // INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01 1690 // LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01 1691 // 1692 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which 1693 // has 24 significant bits would be an invalid input if dst predicate register refers to 1694 // a LONG type 1024-bit vector, which has at most 16 lanes. 1695 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt, 1696 FloatRegister vtmp1, FloatRegister vtmp2) { 1697 assert(UseSVE == 2 && VM_Version::supports_svebitperm() && 1698 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported"); 1699 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1700 // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16 1701 // Expected: dst = 0b01101001 10001101 1702 1703 // Put long value from general purpose register into the first lane of vector. 1704 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1705 sve_dup(vtmp1, B, 0); 1706 mov(vtmp1, D, 0, src); 1707 1708 // As sve_cmp generates mask value with the minimum unit in byte, we should 1709 // transform the value in the first lane which is mask in bit now to the 1710 // mask in byte, which can be done by SVE2's BDEP instruction. 1711 1712 // The first source input of BDEP instruction. Deposite each byte in every 8 bytes. 1713 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1714 if (lane_cnt <= 8) { 1715 // Nothing. As only one byte exsits. 1716 } else if (lane_cnt <= 16) { 1717 ins(vtmp1, B, vtmp1, 8, 1); 1718 mov(vtmp1, B, 1, zr); 1719 } else { 1720 sve_vector_extend(vtmp1, D, vtmp1, B); 1721 } 1722 1723 // The second source input of BDEP instruction, initialized with 0x01 for each byte. 
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  // ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate the mask from the given vector, in which the elements have been
  // extended to the expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g.
4I -> 4B 1805 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1806 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1807 if (src_bt == T_SHORT) { 1808 // 4S/8S to 4B/8B 1809 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1810 assert(dst_bt == T_BYTE, "unsupported"); 1811 xtn(dst, T8B, src, T8H); 1812 } else if (src_bt == T_INT) { 1813 // 4I to 4B/4S 1814 assert(src_vlen_in_bytes == 16, "unsupported"); 1815 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1816 xtn(dst, T4H, src, T4S); 1817 if (dst_bt == T_BYTE) { 1818 xtn(dst, T8B, dst, T8H); 1819 } 1820 } else if (src_bt == T_LONG) { 1821 // 2L to 2I 1822 assert(src_vlen_in_bytes == 16, "unsupported"); 1823 assert(dst_bt == T_INT, "unsupported"); 1824 xtn(dst, T2S, src, T2D); 1825 } else { 1826 ShouldNotReachHere(); 1827 } 1828 } 1829 1830 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1831 FloatRegister src, SIMD_RegVariant src_size, 1832 bool is_unsigned) { 1833 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1834 1835 if (src_size == B) { 1836 switch (dst_size) { 1837 case H: 1838 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1839 break; 1840 case S: 1841 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1842 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1843 break; 1844 case D: 1845 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1846 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1847 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1848 break; 1849 default: 1850 ShouldNotReachHere(); 1851 } 1852 } else if (src_size == H) { 1853 if (dst_size == S) { 1854 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1855 } else { // D 1856 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1857 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1858 } 1859 } else if (src_size == S) { 1860 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1861 } 1862 } 1863 1864 // Vector narrow from src to dst with specified element sizes. 1865 // High part of dst vector will be filled with zero. 1866 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1867 FloatRegister src, SIMD_RegVariant src_size, 1868 FloatRegister tmp) { 1869 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1870 assert_different_registers(src, tmp); 1871 sve_dup(tmp, src_size, 0); 1872 if (src_size == D) { 1873 switch (dst_size) { 1874 case S: 1875 sve_uzp1(dst, S, src, tmp); 1876 break; 1877 case H: 1878 assert_different_registers(dst, tmp); 1879 sve_uzp1(dst, S, src, tmp); 1880 sve_uzp1(dst, H, dst, tmp); 1881 break; 1882 case B: 1883 assert_different_registers(dst, tmp); 1884 sve_uzp1(dst, S, src, tmp); 1885 sve_uzp1(dst, H, dst, tmp); 1886 sve_uzp1(dst, B, dst, tmp); 1887 break; 1888 default: 1889 ShouldNotReachHere(); 1890 } 1891 } else if (src_size == S) { 1892 if (dst_size == H) { 1893 sve_uzp1(dst, H, src, tmp); 1894 } else { // B 1895 assert_different_registers(dst, tmp); 1896 sve_uzp1(dst, H, src, tmp); 1897 sve_uzp1(dst, B, dst, tmp); 1898 } 1899 } else if (src_size == H) { 1900 sve_uzp1(dst, B, src, tmp); 1901 } 1902 } 1903 1904 // Extend src predicate to dst predicate with the same lane count but larger 1905 // element size, e.g. 
64Byte -> 512Long 1906 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1907 uint dst_element_length_in_bytes, 1908 uint src_element_length_in_bytes) { 1909 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1910 sve_punpklo(dst, src); 1911 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1912 sve_punpklo(dst, src); 1913 sve_punpklo(dst, dst); 1914 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1915 sve_punpklo(dst, src); 1916 sve_punpklo(dst, dst); 1917 sve_punpklo(dst, dst); 1918 } else { 1919 assert(false, "unsupported"); 1920 ShouldNotReachHere(); 1921 } 1922 } 1923 1924 // Narrow src predicate to dst predicate with the same lane count but 1925 // smaller element size, e.g. 512Long -> 64Byte 1926 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1927 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1928 // The insignificant bits in src predicate are expected to be zero. 1929 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1930 // passed as the second argument. An example narrowing operation with a given mask would be - 1931 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1932 // Mask (for 2 Longs) : TF 1933 // Predicate register for the above mask (16 bits) : 00000001 00000000 1934 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1935 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1936 assert_different_registers(src, ptmp); 1937 assert_different_registers(dst, ptmp); 1938 sve_pfalse(ptmp); 1939 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1940 sve_uzp1(dst, B, src, ptmp); 1941 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1942 sve_uzp1(dst, H, src, ptmp); 1943 sve_uzp1(dst, B, dst, ptmp); 1944 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1945 sve_uzp1(dst, S, src, ptmp); 1946 sve_uzp1(dst, H, dst, ptmp); 1947 sve_uzp1(dst, B, dst, ptmp); 1948 } else { 1949 assert(false, "unsupported"); 1950 ShouldNotReachHere(); 1951 } 1952 } 1953 1954 // Vector reduction add for integral type with ASIMD instructions. 1955 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1956 Register isrc, FloatRegister vsrc, 1957 unsigned vector_length_in_bytes, 1958 FloatRegister vtmp) { 1959 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1960 assert_different_registers(dst, isrc); 1961 bool isQ = vector_length_in_bytes == 16; 1962 1963 BLOCK_COMMENT("neon_reduce_add_integral {"); 1964 switch(bt) { 1965 case T_BYTE: 1966 addv(vtmp, isQ ? T16B : T8B, vsrc); 1967 smov(dst, vtmp, B, 0); 1968 addw(dst, dst, isrc, ext::sxtb); 1969 break; 1970 case T_SHORT: 1971 addv(vtmp, isQ ? T8H : T4H, vsrc); 1972 smov(dst, vtmp, H, 0); 1973 addw(dst, dst, isrc, ext::sxth); 1974 break; 1975 case T_INT: 1976 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1977 umov(dst, vtmp, S, 0); 1978 addw(dst, dst, isrc); 1979 break; 1980 case T_LONG: 1981 assert(isQ, "unsupported"); 1982 addpd(vtmp, vsrc); 1983 umov(dst, vtmp, D, 0); 1984 add(dst, dst, isrc); 1985 break; 1986 default: 1987 assert(false, "unsupported"); 1988 ShouldNotReachHere(); 1989 } 1990 BLOCK_COMMENT("} neon_reduce_add_integral"); 1991 } 1992 1993 // Vector reduction multiply for integral type with ASIMD instructions. 
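// The vector is folded in half repeatedly, multiplying the upper half into the lower half,
// until two lanes remain; these are then multiplied into the scalar input one at a time.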
1994 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1995 // Clobbers: rscratch1 1996 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1997 Register isrc, FloatRegister vsrc, 1998 unsigned vector_length_in_bytes, 1999 FloatRegister vtmp1, FloatRegister vtmp2) { 2000 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2001 bool isQ = vector_length_in_bytes == 16; 2002 2003 BLOCK_COMMENT("neon_reduce_mul_integral {"); 2004 switch(bt) { 2005 case T_BYTE: 2006 if (isQ) { 2007 // Multiply the lower half and higher half of vector iteratively. 2008 // vtmp1 = vsrc[8:15] 2009 ins(vtmp1, D, vsrc, 0, 1); 2010 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 2011 mulv(vtmp1, T8B, vtmp1, vsrc); 2012 // vtmp2 = vtmp1[4:7] 2013 ins(vtmp2, S, vtmp1, 0, 1); 2014 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 2015 mulv(vtmp1, T8B, vtmp2, vtmp1); 2016 } else { 2017 ins(vtmp1, S, vsrc, 0, 1); 2018 mulv(vtmp1, T8B, vtmp1, vsrc); 2019 } 2020 // vtmp2 = vtmp1[2:3] 2021 ins(vtmp2, H, vtmp1, 0, 1); 2022 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 2023 mulv(vtmp2, T8B, vtmp2, vtmp1); 2024 // dst = vtmp2[0] * isrc * vtmp2[1] 2025 umov(rscratch1, vtmp2, B, 0); 2026 mulw(dst, rscratch1, isrc); 2027 sxtb(dst, dst); 2028 umov(rscratch1, vtmp2, B, 1); 2029 mulw(dst, rscratch1, dst); 2030 sxtb(dst, dst); 2031 break; 2032 case T_SHORT: 2033 if (isQ) { 2034 ins(vtmp2, D, vsrc, 0, 1); 2035 mulv(vtmp2, T4H, vtmp2, vsrc); 2036 ins(vtmp1, S, vtmp2, 0, 1); 2037 mulv(vtmp1, T4H, vtmp1, vtmp2); 2038 } else { 2039 ins(vtmp1, S, vsrc, 0, 1); 2040 mulv(vtmp1, T4H, vtmp1, vsrc); 2041 } 2042 umov(rscratch1, vtmp1, H, 0); 2043 mulw(dst, rscratch1, isrc); 2044 sxth(dst, dst); 2045 umov(rscratch1, vtmp1, H, 1); 2046 mulw(dst, rscratch1, dst); 2047 sxth(dst, dst); 2048 break; 2049 case T_INT: 2050 if (isQ) { 2051 ins(vtmp1, D, vsrc, 0, 1); 2052 mulv(vtmp1, T2S, vtmp1, vsrc); 2053 } else { 2054 vtmp1 = vsrc; 2055 } 2056 umov(rscratch1, vtmp1, S, 0); 2057 mul(dst, rscratch1, isrc); 2058 umov(rscratch1, vtmp1, S, 1); 2059 mul(dst, rscratch1, dst); 2060 break; 2061 case T_LONG: 2062 umov(rscratch1, vsrc, D, 0); 2063 mul(dst, isrc, rscratch1); 2064 umov(rscratch1, vsrc, D, 1); 2065 mul(dst, dst, rscratch1); 2066 break; 2067 default: 2068 assert(false, "unsupported"); 2069 ShouldNotReachHere(); 2070 } 2071 BLOCK_COMMENT("} neon_reduce_mul_integral"); 2072 } 2073 2074 // Vector reduction multiply for floating-point type with ASIMD instructions. 
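// The lanes are multiplied into the scalar input strictly in lane order, so the result matches
// a sequential scalar loop (floating-point multiplication is not associative).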
2075 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 2076 FloatRegister fsrc, FloatRegister vsrc, 2077 unsigned vector_length_in_bytes, 2078 FloatRegister vtmp) { 2079 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2080 bool isQ = vector_length_in_bytes == 16; 2081 2082 BLOCK_COMMENT("neon_reduce_mul_fp {"); 2083 switch(bt) { 2084 case T_FLOAT: 2085 fmuls(dst, fsrc, vsrc); 2086 ins(vtmp, S, vsrc, 0, 1); 2087 fmuls(dst, dst, vtmp); 2088 if (isQ) { 2089 ins(vtmp, S, vsrc, 0, 2); 2090 fmuls(dst, dst, vtmp); 2091 ins(vtmp, S, vsrc, 0, 3); 2092 fmuls(dst, dst, vtmp); 2093 } 2094 break; 2095 case T_DOUBLE: 2096 assert(isQ, "unsupported"); 2097 fmuld(dst, fsrc, vsrc); 2098 ins(vtmp, D, vsrc, 0, 1); 2099 fmuld(dst, dst, vtmp); 2100 break; 2101 default: 2102 assert(false, "unsupported"); 2103 ShouldNotReachHere(); 2104 } 2105 BLOCK_COMMENT("} neon_reduce_mul_fp"); 2106 } 2107 2108 // Helper to select logical instruction 2109 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 2110 Register Rn, Register Rm, 2111 enum shift_kind kind, unsigned shift) { 2112 switch(opc) { 2113 case Op_AndReductionV: 2114 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 2115 break; 2116 case Op_OrReductionV: 2117 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 2118 break; 2119 case Op_XorReductionV: 2120 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 2121 break; 2122 default: 2123 assert(false, "unsupported"); 2124 ShouldNotReachHere(); 2125 } 2126 } 2127 2128 // Vector reduction logical operations And, Or, Xor 2129 // Clobbers: rscratch1 2130 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2131 Register isrc, FloatRegister vsrc, 2132 unsigned vector_length_in_bytes) { 2133 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2134 "unsupported"); 2135 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2136 assert_different_registers(dst, isrc); 2137 bool isQ = vector_length_in_bytes == 16; 2138 2139 BLOCK_COMMENT("neon_reduce_logical {"); 2140 umov(rscratch1, vsrc, isQ ? D : S, 0); 2141 umov(dst, vsrc, isQ ? 
D : S, 1); 2142 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2143 switch(bt) { 2144 case T_BYTE: 2145 if (isQ) { 2146 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2147 } 2148 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2149 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2150 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2151 sxtb(dst, dst); 2152 break; 2153 case T_SHORT: 2154 if (isQ) { 2155 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2156 } 2157 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2158 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2159 sxth(dst, dst); 2160 break; 2161 case T_INT: 2162 if (isQ) { 2163 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2164 } 2165 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2166 break; 2167 case T_LONG: 2168 assert(isQ, "unsupported"); 2169 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2170 break; 2171 default: 2172 assert(false, "unsupported"); 2173 ShouldNotReachHere(); 2174 } 2175 BLOCK_COMMENT("} neon_reduce_logical"); 2176 } 2177 2178 // Vector reduction min/max for integral type with ASIMD instructions. 2179 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2180 // Clobbers: rscratch1, rflags 2181 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2182 Register isrc, FloatRegister vsrc, 2183 unsigned vector_length_in_bytes, 2184 FloatRegister vtmp) { 2185 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2186 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2187 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2188 assert_different_registers(dst, isrc); 2189 bool isQ = vector_length_in_bytes == 16; 2190 bool is_min = opc == Op_MinReductionV; 2191 2192 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2193 if (bt == T_LONG) { 2194 assert(vtmp == fnoreg, "should be"); 2195 assert(isQ, "should be"); 2196 umov(rscratch1, vsrc, D, 0); 2197 cmp(isrc, rscratch1); 2198 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2199 umov(rscratch1, vsrc, D, 1); 2200 cmp(dst, rscratch1); 2201 csel(dst, dst, rscratch1, is_min ? LT : GT); 2202 } else { 2203 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2204 if (size == T2S) { 2205 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2206 } else { 2207 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2208 } 2209 if (bt == T_INT) { 2210 umov(dst, vtmp, S, 0); 2211 } else { 2212 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2213 } 2214 cmpw(dst, isrc); 2215 cselw(dst, dst, isrc, is_min ? LT : GT); 2216 } 2217 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2218 } 2219 2220 // Vector reduction for integral type with SVE instruction. 2221 // Supported operations are Add, And, Or, Xor, Max, Min. 2222 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
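// The predicated SVE reduction leaves its scalar result in element 0 of tmp; that element is
// moved to dst with umov/smov and then combined with src1.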
2223 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2224 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2225 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2226 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2227 assert_different_registers(src1, dst); 2228 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2229 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2230 switch (opc) { 2231 case Op_AddReductionVI: { 2232 sve_uaddv(tmp, size, pg, src2); 2233 if (bt == T_BYTE) { 2234 smov(dst, tmp, size, 0); 2235 addw(dst, src1, dst, ext::sxtb); 2236 } else if (bt == T_SHORT) { 2237 smov(dst, tmp, size, 0); 2238 addw(dst, src1, dst, ext::sxth); 2239 } else { 2240 umov(dst, tmp, size, 0); 2241 addw(dst, dst, src1); 2242 } 2243 break; 2244 } 2245 case Op_AddReductionVL: { 2246 sve_uaddv(tmp, size, pg, src2); 2247 umov(dst, tmp, size, 0); 2248 add(dst, dst, src1); 2249 break; 2250 } 2251 case Op_AndReductionV: { 2252 sve_andv(tmp, size, pg, src2); 2253 if (bt == T_INT || bt == T_LONG) { 2254 umov(dst, tmp, size, 0); 2255 } else { 2256 smov(dst, tmp, size, 0); 2257 } 2258 if (bt == T_LONG) { 2259 andr(dst, dst, src1); 2260 } else { 2261 andw(dst, dst, src1); 2262 } 2263 break; 2264 } 2265 case Op_OrReductionV: { 2266 sve_orv(tmp, size, pg, src2); 2267 if (bt == T_INT || bt == T_LONG) { 2268 umov(dst, tmp, size, 0); 2269 } else { 2270 smov(dst, tmp, size, 0); 2271 } 2272 if (bt == T_LONG) { 2273 orr(dst, dst, src1); 2274 } else { 2275 orrw(dst, dst, src1); 2276 } 2277 break; 2278 } 2279 case Op_XorReductionV: { 2280 sve_eorv(tmp, size, pg, src2); 2281 if (bt == T_INT || bt == T_LONG) { 2282 umov(dst, tmp, size, 0); 2283 } else { 2284 smov(dst, tmp, size, 0); 2285 } 2286 if (bt == T_LONG) { 2287 eor(dst, dst, src1); 2288 } else { 2289 eorw(dst, dst, src1); 2290 } 2291 break; 2292 } 2293 case Op_MaxReductionV: { 2294 sve_smaxv(tmp, size, pg, src2); 2295 if (bt == T_INT || bt == T_LONG) { 2296 umov(dst, tmp, size, 0); 2297 } else { 2298 smov(dst, tmp, size, 0); 2299 } 2300 if (bt == T_LONG) { 2301 cmp(dst, src1); 2302 csel(dst, dst, src1, Assembler::GT); 2303 } else { 2304 cmpw(dst, src1); 2305 cselw(dst, dst, src1, Assembler::GT); 2306 } 2307 break; 2308 } 2309 case Op_MinReductionV: { 2310 sve_sminv(tmp, size, pg, src2); 2311 if (bt == T_INT || bt == T_LONG) { 2312 umov(dst, tmp, size, 0); 2313 } else { 2314 smov(dst, tmp, size, 0); 2315 } 2316 if (bt == T_LONG) { 2317 cmp(dst, src1); 2318 csel(dst, dst, src1, Assembler::LT); 2319 } else { 2320 cmpw(dst, src1); 2321 cselw(dst, dst, src1, Assembler::LT); 2322 } 2323 break; 2324 } 2325 default: 2326 assert(false, "unsupported"); 2327 ShouldNotReachHere(); 2328 } 2329 2330 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2331 if (bt == T_BYTE) { 2332 sxtb(dst, dst); 2333 } else if (bt == T_SHORT) { 2334 sxth(dst, dst); 2335 } 2336 } 2337 } 2338 2339 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2340 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2341 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
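// Where possible, the lane count is encoded directly as a single ptrue pattern (VL1..VL8,
// VL16..VL256, POW2, MUL4, MUL3); otherwise it is materialized in rscratch1 and the predicate
// is generated with whilelt.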
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all elements to true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src  = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst  = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining lanes with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
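  // The upper halves of src and mask are processed the same way, and the compressed upper part is
  // later shifted left by TRUE_CNT lanes before being merged into dst.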
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high (after shifting) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src  = 88 77 66 55 44 33 22 11
  //                  mask = 01 00 00 01 01 00 01 01
  // Expected result: dst  = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining lanes with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat to the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:  dst   = 00 00 00 00 00 44 22 11
  // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high (after shifting) with the compressed low.
2515 // dst = 00 00 00 88 55 44 22 11 2516 sve_orr(dst, dst, vtmp1); 2517 } 2518 2519 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2520 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2521 SIMD_Arrangement size = isQ ? T16B : T8B; 2522 if (bt == T_BYTE) { 2523 rbit(dst, size, src); 2524 } else { 2525 neon_reverse_bytes(dst, src, bt, isQ); 2526 rbit(dst, size, dst); 2527 } 2528 } 2529 2530 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2531 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2532 SIMD_Arrangement size = isQ ? T16B : T8B; 2533 switch (bt) { 2534 case T_BYTE: 2535 if (dst != src) { 2536 orr(dst, size, src, src); 2537 } 2538 break; 2539 case T_SHORT: 2540 rev16(dst, size, src); 2541 break; 2542 case T_INT: 2543 rev32(dst, size, src); 2544 break; 2545 case T_LONG: 2546 rev64(dst, size, src); 2547 break; 2548 default: 2549 assert(false, "unsupported"); 2550 ShouldNotReachHere(); 2551 } 2552 } 2553 2554 // VectorRearrange implementation for short/int/float/long/double types with NEON 2555 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction. 2556 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group. 2557 // For VectorRearrange long/double, we compare the shuffle input with iota indices, 2558 // and use bsl to implement the operation. 2559 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src, 2560 FloatRegister shuffle, FloatRegister tmp, 2561 BasicType bt, bool isQ) { 2562 assert_different_registers(dst, src, shuffle, tmp); 2563 SIMD_Arrangement size1 = isQ ? T16B : T8B; 2564 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); 2565 2566 // Here is an example that rearranges a NEON vector with 4 ints: 2567 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] 2568 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1]. 2569 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector 2570 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get 2571 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. 2572 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100], 2573 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] 2574 // 4. Use Vm as index register, and use V1 as table register. 2575 // Then get V2 as the result by tbl NEON instructions. 2576 switch (bt) { 2577 case T_SHORT: 2578 mov(tmp, size1, 0x02); 2579 mulv(dst, size2, shuffle, tmp); 2580 mov(tmp, size2, 0x0100); 2581 addv(dst, size1, dst, tmp); 2582 tbl(dst, size1, src, 1, dst); 2583 break; 2584 case T_INT: 2585 case T_FLOAT: 2586 mov(tmp, size1, 0x04); 2587 mulv(dst, size2, shuffle, tmp); 2588 mov(tmp, size2, 0x03020100); 2589 addv(dst, size1, dst, tmp); 2590 tbl(dst, size1, src, 1, dst); 2591 break; 2592 case T_LONG: 2593 case T_DOUBLE: 2594 // Load the iota indices for Long type. The indices are ordered by 2595 // type B/S/I/L/F/D, and the offset between two types is 16; Hence 2596 // the offset for L is 48. 2597 lea(rscratch1, 2598 ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48)); 2599 ldrq(tmp, rscratch1); 2600 // Check whether the input "shuffle" is the same with iota indices. 2601 // Return "src" if true, otherwise swap the two elements of "src". 
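      // cm(EQ) sets a lane to all ones where the shuffle index equals the iota index; ext produces
      // the lane-swapped copy of src, and bsl keeps the original lane where the compare matched
      // and takes the swapped lane otherwise.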
2602 cm(EQ, dst, size2, shuffle, tmp); 2603 ext(tmp, size1, src, src, 8); 2604 bsl(dst, size1, src, tmp); 2605 break; 2606 default: 2607 assert(false, "unsupported element type"); 2608 ShouldNotReachHere(); 2609 } 2610 } 2611 2612 // Extract a scalar element from an sve vector at position 'idx'. 2613 // The input elements in src are expected to be of integral type. 2614 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2615 int idx, FloatRegister vtmp) { 2616 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2617 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2618 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2619 if (bt == T_INT || bt == T_LONG) { 2620 umov(dst, src, size, idx); 2621 } else { 2622 smov(dst, src, size, idx); 2623 } 2624 } else { 2625 sve_orr(vtmp, src, src); 2626 sve_ext(vtmp, vtmp, idx << size); 2627 if (bt == T_INT || bt == T_LONG) { 2628 umov(dst, vtmp, size, 0); 2629 } else { 2630 smov(dst, vtmp, size, 0); 2631 } 2632 } 2633 } 2634 2635 // java.lang.Math::round intrinsics 2636 2637 // Clobbers: rscratch1, rflags 2638 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2639 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2640 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2641 switch (T) { 2642 case T2S: 2643 case T4S: 2644 fmovs(tmp1, T, 0.5f); 2645 mov(rscratch1, jint_cast(0x1.0p23f)); 2646 break; 2647 case T2D: 2648 fmovd(tmp1, T, 0.5); 2649 mov(rscratch1, julong_cast(0x1.0p52)); 2650 break; 2651 default: 2652 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2653 } 2654 fadd(tmp1, T, tmp1, src); 2655 fcvtms(tmp1, T, tmp1); 2656 // tmp1 = floor(src + 0.5, ties to even) 2657 2658 fcvtas(dst, T, src); 2659 // dst = round(src), ties to away 2660 2661 fneg(tmp3, T, src); 2662 dup(tmp2, T, rscratch1); 2663 cm(HS, tmp3, T, tmp3, tmp2); 2664 // tmp3 is now a set of flags 2665 2666 bif(dst, T16B, tmp1, tmp3); 2667 // result in dst 2668 } 2669 2670 // Clobbers: rscratch1, rflags 2671 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2672 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2673 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2674 assert_different_registers(tmp1, tmp2, src, dst); 2675 2676 switch (T) { 2677 case S: 2678 mov(rscratch1, jint_cast(0x1.0p23f)); 2679 break; 2680 case D: 2681 mov(rscratch1, julong_cast(0x1.0p52)); 2682 break; 2683 default: 2684 assert(T == S || T == D, "invalid register variant"); 2685 } 2686 2687 sve_frinta(dst, T, ptrue, src); 2688 // dst = round(src), ties to away 2689 2690 Label none; 2691 2692 sve_fneg(tmp1, T, ptrue, src); 2693 sve_dup(tmp2, T, rscratch1); 2694 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2695 br(EQ, none); 2696 { 2697 sve_cpy(tmp1, T, pgtmp, 0.5); 2698 sve_fadd(tmp1, T, pgtmp, src); 2699 sve_frintm(dst, T, pgtmp, tmp1); 2700 // dst = floor(src + 0.5, ties to even) 2701 } 2702 bind(none); 2703 2704 sve_fcvtzs(dst, T, ptrue, dst, T); 2705 // result in dst 2706 } 2707 2708 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2709 FloatRegister one, SIMD_Arrangement T) { 2710 assert_different_registers(dst, src, zero, one); 2711 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2712 2713 facgt(dst, T, src, zero); 2714 ushr(dst, T, 
dst, 1);                                     // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
  case S:
    sve_and(vtmp, T, min_jint);       // Extract the sign bit of the float value in every lane of src
    sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                      // on the sign of the float value
    break;
  case D:
    sve_and(vtmp, T, min_jlong);
    sve_orr(vtmp, T, jlong_cast(1.0));
    break;
  default:
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}

static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}

void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    return;
  }
  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  if (lo != min_jint && hi != max_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
    subsw(rtmp, rval, hi);
    br(Assembler::LE, L_success);
  } else if (lo != min_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::GE, L_success);
  } else if (hi != max_jint) {
    subsw(rtmp, rval, hi);
    br(Assembler::LE, L_success);
  } else {
    ShouldNotReachHere();
  }

  bind(L_failure);
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}

static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}

void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    return;
  }
  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  if (lo != min_jlong && hi != max_jlong) {
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
    subs(rtmp, rval, hi);
2815 br(Assembler::LE, L_success); 2816 } else if (lo != min_jlong) { 2817 subs(rtmp, rval, lo); 2818 br(Assembler::GE, L_success); 2819 } else if (hi != max_jlong) { 2820 subs(rtmp, rval, hi); 2821 br(Assembler::LE, L_success); 2822 } else { 2823 ShouldNotReachHere(); 2824 } 2825 2826 bind(L_failure); 2827 movw(c_rarg0, idx); 2828 mov(c_rarg1, rval); 2829 mov(c_rarg2, lo); 2830 mov(c_rarg3, hi); 2831 reconstruct_frame_pointer(rtmp); 2832 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp); 2833 hlt(0); 2834 2835 bind(L_success); 2836 BLOCK_COMMENT("} verify_long_in_range"); 2837 } 2838 2839 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 2840 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 2841 if (PreserveFramePointer) { 2842 // frame pointer is valid 2843 #ifdef ASSERT 2844 // Verify frame pointer value in rfp. 2845 add(rtmp, sp, framesize - 2 * wordSize); 2846 Label L_success; 2847 cmp(rfp, rtmp); 2848 br(Assembler::EQ, L_success); 2849 stop("frame pointer mismatch"); 2850 bind(L_success); 2851 #endif // ASSERT 2852 } else { 2853 add(rfp, sp, framesize - 2 * wordSize); 2854 } 2855 }