1 /* 2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "opto/c2_MacroAssembler.hpp" 28 #include "opto/compile.hpp" 29 #include "opto/intrinsicnode.hpp" 30 #include "opto/matcher.hpp" 31 #include "opto/output.hpp" 32 #include "opto/subnode.hpp" 33 #include "runtime/stubRoutines.hpp" 34 #include "utilities/globalDefinitions.hpp" 35 #include "utilities/powerOfTwo.hpp" 36 37 #ifdef PRODUCT 38 #define BLOCK_COMMENT(str) /* nothing */ 39 #define STOP(error) stop(error) 40 #else 41 #define BLOCK_COMMENT(str) block_comment(str) 42 #define STOP(error) block_comment(error); stop(error) 43 #endif 44 45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 46 47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 48 49 // jdk.internal.util.ArraysSupport.vectorizedHashCode 50 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 51 FloatRegister vdata0, FloatRegister vdata1, 52 FloatRegister vdata2, FloatRegister vdata3, 53 FloatRegister vmul0, FloatRegister vmul1, 54 FloatRegister vmul2, FloatRegister vmul3, 55 FloatRegister vpow, FloatRegister vpowm, 56 BasicType eltype) { 57 ARRAYS_HASHCODE_REGISTERS; 58 59 Register tmp1 = rscratch1, tmp2 = rscratch2; 60 61 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE; 62 63 // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We 64 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to 65 // use 4H for chars and shorts instead, but using 8H gives better performance. 66 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8 67 : eltype == T_CHAR || eltype == T_SHORT ? 8 68 : eltype == T_INT ? 4 69 : 0; 70 guarantee(vf, "unsupported eltype"); 71 72 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis. 
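  //
  // In C-like pseudocode the scalar part below computes the usual polynomial
  // hash (sketch only; the real code uses the jump-into-unrolled-loop trick
  // described further down):
  //
  //   int h = result;
  //   for (int i = 0; i < cnt; i++) {
  //     h = 31 * h + ary[i];   // load width and signedness depend on eltype
  //   }
  //   result = h;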
73 const size_t unroll_factor = 4; 74 75 switch (eltype) { 76 case T_BOOLEAN: 77 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); 78 break; 79 case T_CHAR: 80 BLOCK_COMMENT("arrays_hashcode(char) {"); 81 break; 82 case T_BYTE: 83 BLOCK_COMMENT("arrays_hashcode(byte) {"); 84 break; 85 case T_SHORT: 86 BLOCK_COMMENT("arrays_hashcode(short) {"); 87 break; 88 case T_INT: 89 BLOCK_COMMENT("arrays_hashcode(int) {"); 90 break; 91 default: 92 ShouldNotReachHere(); 93 } 94 95 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop 96 // implemented by the stub executes just once. Call the stub only if at least two iterations will 97 // be executed. 98 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf; 99 cmpw(cnt, large_threshold); 100 br(Assembler::HS, LARGE); 101 102 bind(TAIL); 103 104 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past 105 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs. 106 // Iteration eats up the remainder, uf elements at a time. 107 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC"); 108 andr(tmp2, cnt, unroll_factor - 1); 109 adr(tmp1, BR_BASE); 110 sub(tmp1, tmp1, tmp2, ext::sxtw, 3); 111 movw(tmp2, 0x1f); 112 br(tmp1); 113 114 bind(LOOP); 115 for (size_t i = 0; i < unroll_factor; ++i) { 116 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype); 117 maddw(result, result, tmp2, tmp1); 118 } 119 bind(BR_BASE); 120 subsw(cnt, cnt, unroll_factor); 121 br(Assembler::HS, LOOP); 122 123 b(DONE); 124 125 bind(LARGE); 126 127 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype)); 128 assert(stub.target() != nullptr, "array_hashcode stub has not been generated"); 129 address tpc = trampoline_call(stub); 130 if (tpc == nullptr) { 131 DEBUG_ONLY(reset_labels(TAIL, BR_BASE)); 132 postcond(pc() == badAddress); 133 return nullptr; 134 } 135 136 bind(DONE); 137 138 BLOCK_COMMENT("} // arrays_hashcode"); 139 140 postcond(pc() != badAddress); 141 return pc(); 142 } 143 144 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 145 Register tmp2Reg, Register tmp3Reg) { 146 Register oop = objectReg; 147 Register box = boxReg; 148 Register disp_hdr = tmpReg; 149 Register tmp = tmp2Reg; 150 Label cont; 151 Label object_has_monitor; 152 Label count, no_count; 153 154 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 155 assert_different_registers(oop, box, tmp, disp_hdr, rscratch2); 156 157 // Load markWord from object into displaced_header. 158 ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 159 160 if (DiagnoseSyncOnValueBasedClasses != 0) { 161 load_klass(tmp, oop); 162 ldrb(tmp, Address(tmp, Klass::misc_flags_offset())); 163 tst(tmp, KlassFlags::_misc_is_value_based_class); 164 br(Assembler::NE, cont); 165 } 166 167 // Check for existing monitor 168 tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor); 169 170 if (LockingMode == LM_MONITOR) { 171 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 172 b(cont); 173 } else { 174 assert(LockingMode == LM_LEGACY, "must be"); 175 // Set tmp to be (markWord of object | UNLOCK_VALUE). 176 orr(tmp, disp_hdr, markWord::unlocked_value); 177 178 // Initialize the box. (Must happen before we update the object mark!) 
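    // The value stored here is the displaced markWord; fast_unlock restores it
    // into the object when the stack-lock is released.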
179 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 180 181 // Compare object markWord with an unlocked value (tmp) and if 182 // equal exchange the stack address of our box with object markWord. 183 // On failure disp_hdr contains the possibly locked markWord. 184 cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true, 185 /*release*/ true, /*weak*/ false, disp_hdr); 186 br(Assembler::EQ, cont); 187 188 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 189 190 // If the compare-and-exchange succeeded, then we found an unlocked 191 // object, will have now locked it will continue at label cont 192 193 // Check if the owner is self by comparing the value in the 194 // markWord of object (disp_hdr) with the stack pointer. 195 mov(rscratch1, sp); 196 sub(disp_hdr, disp_hdr, rscratch1); 197 mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place)); 198 // If condition is true we are cont and hence we can store 0 as the 199 // displaced header in the box, which indicates that it is a recursive lock. 200 ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result 201 str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes())); 202 b(cont); 203 } 204 205 // Handle existing monitor. 206 bind(object_has_monitor); 207 208 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 209 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset())); 210 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value)); 211 cmpxchg(tmp, zr, rscratch2, Assembler::xword, /*acquire*/ true, 212 /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result 213 214 // Store a non-null value into the box to avoid looking like a re-entrant 215 // lock. The fast-path monitor unlock code checks for 216 // markWord::monitor_value so use markWord::unused_mark which has the 217 // relevant bit set, and also matches ObjectSynchronizer::enter. 218 mov(tmp, (address)markWord::unused_mark().value()); 219 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 220 221 br(Assembler::EQ, cont); // CAS success means locking succeeded 222 223 cmp(tmp3Reg, rscratch2); 224 br(Assembler::NE, cont); // Check for recursive locking 225 226 // Recursive lock case 227 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1); 228 // flag == EQ still from the cmp above, checking if this is a reentrant lock 229 230 bind(cont); 231 // flag == EQ indicates success 232 // flag == NE indicates failure 233 br(Assembler::NE, no_count); 234 235 bind(count); 236 if (LockingMode == LM_LEGACY) { 237 inc_held_monitor_count(rscratch1); 238 } 239 240 bind(no_count); 241 } 242 243 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg, 244 Register tmp2Reg) { 245 Register oop = objectReg; 246 Register box = boxReg; 247 Register disp_hdr = tmpReg; 248 Register owner_addr = tmpReg; 249 Register tmp = tmp2Reg; 250 Label cont; 251 Label object_has_monitor; 252 Label count, no_count; 253 Label unlocked; 254 255 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 256 assert_different_registers(oop, box, tmp, disp_hdr); 257 258 if (LockingMode == LM_LEGACY) { 259 // Find the lock address and load the displaced header from the stack. 260 ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes())); 261 262 // If the displaced header is 0, we have a recursive unlock. 
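    // (fast_lock stores a zero displaced header for a recursive stack-lock.)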
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the BasicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty (EntryList first - by convention).
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, tmpReg);
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont); // If so we are done.

  // Check if there is a successor.
  ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
  cmp(rscratch1, zr);
  br(Assembler::NE, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

  cmp(zr, rthread); // Set Flag to NE => slow path
  b(cont);

  bind(unlocked);
  cmp(zr, zr); // Set Flag to EQ => fast path

  // Intentional fall-through

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  if (LockingMode == LM_LEGACY) {
    dec_held_monitor_count(rscratch1);
  }

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
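    // A cleared slot means "no cached monitor"; fast_unlock_lightweight reloads it
    // and takes the slow path unless it holds a valid monitor pointer.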
366 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 367 } 368 369 if (DiagnoseSyncOnValueBasedClasses != 0) { 370 load_klass(t1, obj); 371 ldrb(t1, Address(t1, Klass::misc_flags_offset())); 372 tst(t1, KlassFlags::_misc_is_value_based_class); 373 br(Assembler::NE, slow_path); 374 } 375 376 const Register t1_mark = t1; 377 const Register t3_t = t3; 378 379 { // Lightweight locking 380 381 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ 382 Label push; 383 384 const Register t2_top = t2; 385 386 // Check if lock-stack is full. 387 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 388 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1); 389 br(Assembler::GT, slow_path); 390 391 // Check if recursive. 392 subw(t3_t, t2_top, oopSize); 393 ldr(t3_t, Address(rthread, t3_t)); 394 cmp(obj, t3_t); 395 br(Assembler::EQ, push); 396 397 // Relaxed normal load to check for monitor. Optimization for monitor case. 398 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 399 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 400 401 // Not inflated 402 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 403 404 // Try to lock. Transition lock-bits 0b01 => 0b00 405 orr(t1_mark, t1_mark, markWord::unlocked_value); 406 eor(t3_t, t1_mark, markWord::unlocked_value); 407 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 408 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 409 br(Assembler::NE, slow_path); 410 411 bind(push); 412 // After successful lock, push object on lock-stack. 413 str(obj, Address(rthread, t2_top)); 414 addw(t2_top, t2_top, oopSize); 415 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 416 b(locked); 417 } 418 419 { // Handle inflated monitor. 420 bind(inflated); 421 422 const Register t1_monitor = t1; 423 424 if (!UseObjectMonitorTable) { 425 assert(t1_monitor == t1_mark, "should be the same here"); 426 } else { 427 Label monitor_found; 428 429 // Load cache address 430 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset())); 431 432 const int num_unrolled = 2; 433 for (int i = 0; i < num_unrolled; i++) { 434 ldr(t1, Address(t3_t)); 435 cmp(obj, t1); 436 br(Assembler::EQ, monitor_found); 437 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 438 } 439 440 Label loop; 441 442 // Search for obj in cache. 443 bind(loop); 444 445 // Check for match. 446 ldr(t1, Address(t3_t)); 447 cmp(obj, t1); 448 br(Assembler::EQ, monitor_found); 449 450 // Search until null encountered, guaranteed _null_sentinel at end. 451 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 452 cbnz(t1, loop); 453 // Cache Miss, NE set from cmp above, cbnz does not set flags 454 b(slow_path); 455 456 bind(monitor_found); 457 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference())); 458 } 459 460 const Register t2_owner_addr = t2; 461 const Register t3_owner = t3; 462 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 463 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag); 464 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag); 465 466 Label monitor_locked; 467 468 // Compute owner address. 469 lea(t2_owner_addr, owner_address); 470 471 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 
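    // On CAS failure t3_owner receives the current owner, which is compared
    // against our id below to detect a recursive acquire.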
472 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset())); 473 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true, 474 /*release*/ false, /*weak*/ false, t3_owner); 475 br(Assembler::EQ, monitor_locked); 476 477 // Check if recursive. 478 cmp(t3_owner, rscratch2); 479 br(Assembler::NE, slow_path); 480 481 // Recursive. 482 increment(recursions_address, 1); 483 484 bind(monitor_locked); 485 if (UseObjectMonitorTable) { 486 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 487 } 488 } 489 490 bind(locked); 491 492 #ifdef ASSERT 493 // Check that locked label is reached with Flags == EQ. 494 Label flag_correct; 495 br(Assembler::EQ, flag_correct); 496 stop("Fast Lock Flag != EQ"); 497 #endif 498 499 bind(slow_path); 500 #ifdef ASSERT 501 // Check that slow_path label is reached with Flags == NE. 502 br(Assembler::NE, flag_correct); 503 stop("Fast Lock Flag != NE"); 504 bind(flag_correct); 505 #endif 506 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 507 } 508 509 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1, 510 Register t2, Register t3) { 511 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 512 assert_different_registers(obj, box, t1, t2, t3); 513 514 // Handle inflated monitor. 515 Label inflated, inflated_load_mark; 516 // Finish fast unlock successfully. MUST branch to with flag == EQ 517 Label unlocked; 518 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 519 Label slow_path; 520 521 const Register t1_mark = t1; 522 const Register t2_top = t2; 523 const Register t3_t = t3; 524 525 { // Lightweight unlock 526 527 Label push_and_slow_path; 528 529 // Check if obj is top of lock-stack. 530 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 531 subw(t2_top, t2_top, oopSize); 532 ldr(t3_t, Address(rthread, t2_top)); 533 cmp(obj, t3_t); 534 // Top of lock stack was not obj. Must be monitor. 535 br(Assembler::NE, inflated_load_mark); 536 537 // Pop lock-stack. 538 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 539 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 540 541 // Check if recursive. 542 subw(t3_t, t2_top, oopSize); 543 ldr(t3_t, Address(rthread, t3_t)); 544 cmp(obj, t3_t); 545 br(Assembler::EQ, unlocked); 546 547 // Not recursive. 548 // Load Mark. 549 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 550 551 // Check header for monitor (0b10). 552 // Because we got here by popping (meaning we pushed in locked) 553 // there will be no monitor in the box. So we need to push back the obj 554 // so that the runtime can fix any potential anonymous owner. 555 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated); 556 557 // Try to unlock. Transition lock bits 0b00 => 0b01 558 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 559 orr(t3_t, t1_mark, markWord::unlocked_value); 560 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 561 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 562 br(Assembler::EQ, unlocked); 563 564 bind(push_and_slow_path); 565 // Compare and exchange failed. 566 // Restore lock-stack and handle the unlock in runtime. 567 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 568 addw(t2_top, t2_top, oopSize); 569 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 570 b(slow_path); 571 } 572 573 574 { // Handle inflated monitor. 
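    // The monitor is located either by untagging the markWord or, with
    // UseObjectMonitorTable, by reloading the pointer cached in the BasicLock.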
575 bind(inflated_load_mark); 576 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 577 #ifdef ASSERT 578 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 579 stop("Fast Unlock not monitor"); 580 #endif 581 582 bind(inflated); 583 584 #ifdef ASSERT 585 Label check_done; 586 subw(t2_top, t2_top, oopSize); 587 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 588 br(Assembler::LT, check_done); 589 ldr(t3_t, Address(rthread, t2_top)); 590 cmp(obj, t3_t); 591 br(Assembler::NE, inflated); 592 stop("Fast Unlock lock on stack"); 593 bind(check_done); 594 #endif 595 596 const Register t1_monitor = t1; 597 598 if (!UseObjectMonitorTable) { 599 assert(t1_monitor == t1_mark, "should be the same here"); 600 601 // Untag the monitor. 602 add(t1_monitor, t1_mark, -(int)markWord::monitor_value); 603 } else { 604 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 605 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 606 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 607 br(Assembler::LO, slow_path); 608 } 609 610 const Register t2_recursions = t2; 611 Label not_recursive; 612 613 // Check if recursive. 614 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 615 cbz(t2_recursions, not_recursive); 616 617 // Recursive unlock. 618 sub(t2_recursions, t2_recursions, 1u); 619 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 620 // Set flag == EQ 621 cmp(t2_recursions, t2_recursions); 622 b(unlocked); 623 624 bind(not_recursive); 625 626 const Register t2_owner_addr = t2; 627 628 // Compute owner address. 629 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 630 631 // Set owner to null. 632 // Release to satisfy the JMM 633 stlr(zr, t2_owner_addr); 634 // We need a full fence after clearing owner to avoid stranding. 635 // StoreLoad achieves this. 636 membar(StoreLoad); 637 638 // Check if the entry lists are empty (EntryList first - by convention). 639 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset())); 640 ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset())); 641 orr(rscratch1, rscratch1, t3_t); 642 cmp(rscratch1, zr); 643 br(Assembler::EQ, unlocked); // If so we are done. 644 645 // Check if there is a successor. 646 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset())); 647 cmp(rscratch1, zr); 648 br(Assembler::NE, unlocked); // If so we are done. 649 650 // Save the monitor pointer in the current thread, so we can try to 651 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 652 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 653 654 cmp(zr, rthread); // Set Flag to NE => slow path 655 b(slow_path); 656 } 657 658 bind(unlocked); 659 cmp(zr, zr); // Set Flags to EQ => fast path 660 661 #ifdef ASSERT 662 // Check that unlocked label is reached with Flags == EQ. 663 Label flag_correct; 664 br(Assembler::EQ, flag_correct); 665 stop("Fast Unlock Flag != EQ"); 666 #endif 667 668 bind(slow_path); 669 #ifdef ASSERT 670 // Check that slow_path label is reached with Flags == NE. 671 br(Assembler::NE, flag_correct); 672 stop("Fast Unlock Flag != NE"); 673 bind(flag_correct); 674 #endif 675 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 676 } 677 678 // Search for str1 in str2 and return index or -1 679 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 
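// icnt1 holds the pattern length when it is known at compile time (the code below
// has special cases for 1, 2, 3 and 4 characters), or -1 when it is only known at runtime.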
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer-Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few Java-specific optimizations.
751 // 752 // #define ASIZE 256 753 // 754 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 755 // int i, j; 756 // unsigned c; 757 // unsigned char bc[ASIZE]; 758 // 759 // /* Preprocessing */ 760 // for (i = 0; i < ASIZE; ++i) 761 // bc[i] = m; 762 // for (i = 0; i < m - 1; ) { 763 // c = x[i]; 764 // ++i; 765 // // c < 256 for Latin1 string, so, no need for branch 766 // #ifdef PATTERN_STRING_IS_LATIN1 767 // bc[c] = m - i; 768 // #else 769 // if (c < ASIZE) bc[c] = m - i; 770 // #endif 771 // } 772 // 773 // /* Searching */ 774 // j = 0; 775 // while (j <= n - m) { 776 // c = y[i+j]; 777 // if (x[m-1] == c) 778 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 779 // if (i < 0) return j; 780 // // c < 256 for Latin1 string, so, no need for branch 781 // #ifdef SOURCE_STRING_IS_LATIN1 782 // // LL case: (c< 256) always true. Remove branch 783 // j += bc[y[j+m-1]]; 784 // #endif 785 // #ifndef PATTERN_STRING_IS_UTF 786 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 787 // if (c < ASIZE) 788 // j += bc[y[j+m-1]]; 789 // else 790 // j += 1 791 // #endif 792 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 793 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 794 // if (c < ASIZE) 795 // j += bc[y[j+m-1]]; 796 // else 797 // j += m 798 // #endif 799 // } 800 // } 801 802 if (icnt1 == -1) { 803 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 804 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 805 Register cnt1end = tmp2; 806 Register str2end = cnt2; 807 Register skipch = tmp2; 808 809 // str1 length is >=8, so, we can read at least 1 register for cases when 810 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 811 // UL case. We'll re-read last character in inner pre-loop code to have 812 // single outer pre-loop load 813 const int firstStep = isL ? 7 : 3; 814 815 const int ASIZE = 256; 816 const int STORED_BYTES = 32; // amount of bytes stored per instruction 817 sub(sp, sp, ASIZE); 818 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 819 mov(ch1, sp); 820 BIND(BM_INIT_LOOP); 821 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 822 subs(tmp5, tmp5, 1); 823 br(GT, BM_INIT_LOOP); 824 825 sub(cnt1tmp, cnt1, 1); 826 mov(tmp5, str2); 827 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 828 sub(ch2, cnt1, 1); 829 mov(tmp3, str1); 830 BIND(BCLOOP); 831 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 832 if (!str1_isL) { 833 subs(zr, ch1, ASIZE); 834 br(HS, BCSKIP); 835 } 836 strb(ch2, Address(sp, ch1)); 837 BIND(BCSKIP); 838 subs(ch2, ch2, 1); 839 br(GT, BCLOOP); 840 841 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 842 if (str1_isL == str2_isL) { 843 // load last 8 bytes (8LL/4UU symbols) 844 ldr(tmp6, Address(tmp6, -wordSize)); 845 } else { 846 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 847 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 848 // it's still faster than per-character loads+checks 849 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 850 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 851 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 852 andr(tmp6, tmp6, 0xFF); // str1[N-4] 853 orr(ch2, ch1, ch2, LSL, 16); 854 orr(tmp6, tmp6, tmp3, LSL, 48); 855 orr(tmp6, tmp6, ch2, LSL, 16); 856 } 857 BIND(BMLOOPSTR2); 858 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 859 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 860 if (str1_isL == str2_isL) { 861 // re-init tmp3. It's for free because it's executed in parallel with 862 // load above. Alternative is to initialize it before loop, but it'll 863 // affect performance on in-order systems with 2 or more ld/st pipelines 864 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 865 } 866 if (!isL) { // UU/UL case 867 lsl(ch2, cnt1tmp, 1); // offset in bytes 868 } 869 cmp(tmp3, skipch); 870 br(NE, BMSKIP); 871 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 872 mov(ch1, tmp6); 873 if (isL) { 874 b(BMLOOPSTR1_AFTER_LOAD); 875 } else { 876 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 877 b(BMLOOPSTR1_CMP); 878 } 879 BIND(BMLOOPSTR1); 880 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 881 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 882 BIND(BMLOOPSTR1_AFTER_LOAD); 883 subs(cnt1tmp, cnt1tmp, 1); 884 br(LT, BMLOOPSTR1_LASTCMP); 885 BIND(BMLOOPSTR1_CMP); 886 cmp(ch1, ch2); 887 br(EQ, BMLOOPSTR1); 888 BIND(BMSKIP); 889 if (!isL) { 890 // if we've met UTF symbol while searching Latin1 pattern, then we can 891 // skip cnt1 symbols 892 if (str1_isL != str2_isL) { 893 mov(result_tmp, cnt1); 894 } else { 895 mov(result_tmp, 1); 896 } 897 subs(zr, skipch, ASIZE); 898 br(HS, BMADV); 899 } 900 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 901 BIND(BMADV); 902 sub(cnt1tmp, cnt1, 1); 903 add(str2, str2, result_tmp, LSL, str2_chr_shift); 904 cmp(str2, str2end); 905 br(LE, BMLOOPSTR2); 906 add(sp, sp, ASIZE); 907 b(NOMATCH); 908 BIND(BMLOOPSTR1_LASTCMP); 909 cmp(ch1, ch2); 910 br(NE, BMSKIP); 911 BIND(BMMATCH); 912 sub(result, str2, tmp5); 913 if (!str2_isL) lsr(result, result, 1); 914 add(sp, sp, ASIZE); 915 b(DONE); 916 917 BIND(LINEARSTUB); 918 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 919 br(LT, LINEAR_MEDIUM); 920 mov(result, zr); 921 RuntimeAddress stub = nullptr; 922 if (isL) { 923 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 924 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 925 } else if (str1_isL) { 926 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 927 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 928 } else { 929 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 930 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 931 } 932 address call = trampoline_call(stub); 933 if (call == nullptr) { 934 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 935 ciEnv::current()->record_failure("CodeCache is full"); 936 return; 937 } 938 b(DONE); 939 } 940 941 BIND(LINEARSEARCH); 942 { 943 Label DO1, DO2, DO3; 944 945 Register str2tmp = tmp2; 946 Register first = tmp3; 947 948 if (icnt1 == 
-1) 949 { 950 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 951 952 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 953 br(LT, DOSHORT); 954 BIND(LINEAR_MEDIUM); 955 (this->*str1_load_1chr)(first, Address(str1)); 956 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 957 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 958 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 959 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 960 961 BIND(FIRST_LOOP); 962 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 963 cmp(first, ch2); 964 br(EQ, STR1_LOOP); 965 BIND(STR2_NEXT); 966 adds(cnt2_neg, cnt2_neg, str2_chr_size); 967 br(LE, FIRST_LOOP); 968 b(NOMATCH); 969 970 BIND(STR1_LOOP); 971 adds(cnt1tmp, cnt1_neg, str1_chr_size); 972 add(cnt2tmp, cnt2_neg, str2_chr_size); 973 br(GE, MATCH); 974 975 BIND(STR1_NEXT); 976 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 977 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 978 cmp(ch1, ch2); 979 br(NE, STR2_NEXT); 980 adds(cnt1tmp, cnt1tmp, str1_chr_size); 981 add(cnt2tmp, cnt2tmp, str2_chr_size); 982 br(LT, STR1_NEXT); 983 b(MATCH); 984 985 BIND(DOSHORT); 986 if (str1_isL == str2_isL) { 987 cmp(cnt1, (u1)2); 988 br(LT, DO1); 989 br(GT, DO3); 990 } 991 } 992 993 if (icnt1 == 4) { 994 Label CH1_LOOP; 995 996 (this->*load_4chr)(ch1, str1); 997 sub(result_tmp, cnt2, 4); 998 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 999 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1000 1001 BIND(CH1_LOOP); 1002 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 1003 cmp(ch1, ch2); 1004 br(EQ, MATCH); 1005 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1006 br(LE, CH1_LOOP); 1007 b(NOMATCH); 1008 } 1009 1010 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 1011 Label CH1_LOOP; 1012 1013 BIND(DO2); 1014 (this->*load_2chr)(ch1, str1); 1015 if (icnt1 == 2) { 1016 sub(result_tmp, cnt2, 2); 1017 } 1018 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1019 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1020 BIND(CH1_LOOP); 1021 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1022 cmp(ch1, ch2); 1023 br(EQ, MATCH); 1024 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1025 br(LE, CH1_LOOP); 1026 b(NOMATCH); 1027 } 1028 1029 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 1030 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 1031 1032 BIND(DO3); 1033 (this->*load_2chr)(first, str1); 1034 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 1035 if (icnt1 == 3) { 1036 sub(result_tmp, cnt2, 3); 1037 } 1038 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1039 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1040 BIND(FIRST_LOOP); 1041 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1042 cmpw(first, ch2); 1043 br(EQ, STR1_LOOP); 1044 BIND(STR2_NEXT); 1045 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1046 br(LE, FIRST_LOOP); 1047 b(NOMATCH); 1048 1049 BIND(STR1_LOOP); 1050 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 1051 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 1052 cmp(ch1, ch2); 1053 br(NE, STR2_NEXT); 1054 b(MATCH); 1055 } 1056 1057 if (icnt1 == -1 || icnt1 == 1) { 1058 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 1059 1060 BIND(DO1); 1061 (this->*str1_load_1chr)(ch1, str1); 1062 cmp(cnt2, (u1)8); 1063 br(LT, DO1_SHORT); 1064 1065 sub(result_tmp, cnt2, 8/str2_chr_size); 1066 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1067 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 1068 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1069 1070 if (str2_isL) { 1071 orr(ch1, ch1, ch1, LSL, 8); 1072 } 1073 orr(ch1, ch1, ch1, LSL, 16); 1074 orr(ch1, ch1, ch1, LSL, 32); 1075 BIND(CH1_LOOP); 1076 ldr(ch2, Address(str2, cnt2_neg)); 1077 eor(ch2, ch1, ch2); 1078 sub(tmp1, ch2, tmp3); 1079 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 1080 bics(tmp1, tmp1, tmp2); 1081 br(NE, HAS_ZERO); 1082 adds(cnt2_neg, cnt2_neg, 8); 1083 br(LT, CH1_LOOP); 1084 1085 cmp(cnt2_neg, (u1)8); 1086 mov(cnt2_neg, 0); 1087 br(LT, CH1_LOOP); 1088 b(NOMATCH); 1089 1090 BIND(HAS_ZERO); 1091 rev(tmp1, tmp1); 1092 clz(tmp1, tmp1); 1093 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 1094 b(MATCH); 1095 1096 BIND(DO1_SHORT); 1097 mov(result_tmp, cnt2); 1098 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 1099 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 1100 BIND(DO1_LOOP); 1101 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 1102 cmpw(ch1, ch2); 1103 br(EQ, MATCH); 1104 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1105 br(LT, DO1_LOOP); 1106 } 1107 } 1108 BIND(NOMATCH); 1109 mov(result, -1); 1110 b(DONE); 1111 BIND(MATCH); 1112 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 1113 BIND(DONE); 1114 } 1115 1116 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 1117 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 1118 1119 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 1120 Register ch, Register result, 1121 Register tmp1, Register tmp2, Register tmp3) 1122 { 1123 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1124 Register cnt1_neg = cnt1; 1125 Register ch1 = rscratch1; 1126 Register result_tmp = rscratch2; 1127 1128 cbz(cnt1, NOMATCH); 1129 1130 cmp(cnt1, (u1)4); 1131 br(LT, DO1_SHORT); 1132 1133 orr(ch, ch, ch, LSL, 16); 1134 orr(ch, ch, ch, LSL, 32); 1135 1136 sub(cnt1, cnt1, 4); 1137 mov(result_tmp, cnt1); 1138 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1139 sub(cnt1_neg, zr, cnt1, LSL, 1); 1140 1141 mov(tmp3, 0x0001000100010001); 1142 1143 BIND(CH1_LOOP); 1144 ldr(ch1, Address(str1, cnt1_neg)); 1145 eor(ch1, ch, ch1); 1146 sub(tmp1, ch1, tmp3); 1147 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 1148 bics(tmp1, tmp1, tmp2); 1149 br(NE, HAS_ZERO); 1150 adds(cnt1_neg, cnt1_neg, 8); 1151 br(LT, CH1_LOOP); 1152 1153 cmp(cnt1_neg, (u1)8); 1154 mov(cnt1_neg, 0); 1155 br(LT, CH1_LOOP); 1156 b(NOMATCH); 1157 1158 BIND(HAS_ZERO); 1159 rev(tmp1, tmp1); 1160 clz(tmp1, tmp1); 1161 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1162 b(MATCH); 1163 1164 BIND(DO1_SHORT); 1165 mov(result_tmp, cnt1); 1166 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1167 sub(cnt1_neg, zr, cnt1, LSL, 1); 1168 BIND(DO1_LOOP); 1169 ldrh(ch1, Address(str1, cnt1_neg)); 1170 cmpw(ch, ch1); 1171 br(EQ, MATCH); 1172 adds(cnt1_neg, cnt1_neg, 2); 1173 br(LT, DO1_LOOP); 1174 BIND(NOMATCH); 1175 mov(result, -1); 1176 b(DONE); 1177 BIND(MATCH); 1178 add(result, result_tmp, cnt1_neg, ASR, 1); 1179 BIND(DONE); 1180 } 1181 1182 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 1183 Register ch, Register result, 1184 FloatRegister ztmp1, 1185 FloatRegister ztmp2, 1186 PRegister tmp_pg, 1187 PRegister tmp_pdn, bool isL) 1188 { 1189 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 
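  // Only tmp_pg governs the predicated loads and compares below; tmp_pdn merely
  // receives the comparison results.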
1190 assert(tmp_pg->is_governing(), 1191 "this register has to be a governing predicate register"); 1192 1193 Label LOOP, MATCH, DONE, NOMATCH; 1194 Register vec_len = rscratch1; 1195 Register idx = rscratch2; 1196 1197 SIMD_RegVariant T = (isL == true) ? B : H; 1198 1199 cbz(cnt1, NOMATCH); 1200 1201 // Assign the particular char throughout the vector. 1202 sve_dup(ztmp2, T, ch); 1203 if (isL) { 1204 sve_cntb(vec_len); 1205 } else { 1206 sve_cnth(vec_len); 1207 } 1208 mov(idx, 0); 1209 1210 // Generate a predicate to control the reading of input string. 1211 sve_whilelt(tmp_pg, T, idx, cnt1); 1212 1213 BIND(LOOP); 1214 // Read a vector of 8- or 16-bit data depending on the string type. Note 1215 // that inactive elements indicated by the predicate register won't cause 1216 // a data read from memory to the destination vector. 1217 if (isL) { 1218 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1219 } else { 1220 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1221 } 1222 add(idx, idx, vec_len); 1223 1224 // Perform the comparison. An element of the destination predicate is set 1225 // to active if the particular char is matched. 1226 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1227 1228 // Branch if the particular char is found. 1229 br(NE, MATCH); 1230 1231 sve_whilelt(tmp_pg, T, idx, cnt1); 1232 1233 // Loop back if the particular char not found. 1234 br(MI, LOOP); 1235 1236 BIND(NOMATCH); 1237 mov(result, -1); 1238 b(DONE); 1239 1240 BIND(MATCH); 1241 // Undo the index increment. 1242 sub(idx, idx, vec_len); 1243 1244 // Crop the vector to find its location. 1245 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1246 add(result, idx, -1); 1247 sve_incp(result, T, tmp_pdn); 1248 BIND(DONE); 1249 } 1250 1251 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1252 Register ch, Register result, 1253 Register tmp1, Register tmp2, Register tmp3) 1254 { 1255 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1256 Register cnt1_neg = cnt1; 1257 Register ch1 = rscratch1; 1258 Register result_tmp = rscratch2; 1259 1260 cbz(cnt1, NOMATCH); 1261 1262 cmp(cnt1, (u1)8); 1263 br(LT, DO1_SHORT); 1264 1265 orr(ch, ch, ch, LSL, 8); 1266 orr(ch, ch, ch, LSL, 16); 1267 orr(ch, ch, ch, LSL, 32); 1268 1269 sub(cnt1, cnt1, 8); 1270 mov(result_tmp, cnt1); 1271 lea(str1, Address(str1, cnt1)); 1272 sub(cnt1_neg, zr, cnt1); 1273 1274 mov(tmp3, 0x0101010101010101); 1275 1276 BIND(CH1_LOOP); 1277 ldr(ch1, Address(str1, cnt1_neg)); 1278 eor(ch1, ch, ch1); 1279 sub(tmp1, ch1, tmp3); 1280 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1281 bics(tmp1, tmp1, tmp2); 1282 br(NE, HAS_ZERO); 1283 adds(cnt1_neg, cnt1_neg, 8); 1284 br(LT, CH1_LOOP); 1285 1286 cmp(cnt1_neg, (u1)8); 1287 mov(cnt1_neg, 0); 1288 br(LT, CH1_LOOP); 1289 b(NOMATCH); 1290 1291 BIND(HAS_ZERO); 1292 rev(tmp1, tmp1); 1293 clz(tmp1, tmp1); 1294 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1295 b(MATCH); 1296 1297 BIND(DO1_SHORT); 1298 mov(result_tmp, cnt1); 1299 lea(str1, Address(str1, cnt1)); 1300 sub(cnt1_neg, zr, cnt1); 1301 BIND(DO1_LOOP); 1302 ldrb(ch1, Address(str1, cnt1_neg)); 1303 cmp(ch, ch1); 1304 br(EQ, MATCH); 1305 adds(cnt1_neg, cnt1_neg, 1); 1306 br(LT, DO1_LOOP); 1307 BIND(NOMATCH); 1308 mov(result, -1); 1309 b(DONE); 1310 BIND(MATCH); 1311 add(result, result_tmp, cnt1_neg); 1312 BIND(DONE); 1313 } 1314 1315 // Compare strings. 
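// The result is the difference between the first pair of mismatching characters,
// or, when one string is a prefix of the other, the difference of the lengths (in characters).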
1316 void C2_MacroAssembler::string_compare(Register str1, Register str2, 1317 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, 1318 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, 1319 PRegister pgtmp1, PRegister pgtmp2, int ae) { 1320 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 1321 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 1322 SHORT_LOOP_START, TAIL_CHECK; 1323 1324 bool isLL = ae == StrIntrinsicNode::LL; 1325 bool isLU = ae == StrIntrinsicNode::LU; 1326 bool isUL = ae == StrIntrinsicNode::UL; 1327 1328 // The stub threshold for LL strings is: 72 (64 + 8) chars 1329 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch) 1330 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least) 1331 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36); 1332 1333 bool str1_isL = isLL || isLU; 1334 bool str2_isL = isLL || isUL; 1335 1336 int str1_chr_shift = str1_isL ? 0 : 1; 1337 int str2_chr_shift = str2_isL ? 0 : 1; 1338 int str1_chr_size = str1_isL ? 1 : 2; 1339 int str2_chr_size = str2_isL ? 1 : 2; 1340 int minCharsInWord = isLL ? wordSize : wordSize/2; 1341 1342 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 1343 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 1344 (chr_insn)&MacroAssembler::ldrh; 1345 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 1346 (chr_insn)&MacroAssembler::ldrh; 1347 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 1348 (uxt_insn)&MacroAssembler::uxthw; 1349 1350 BLOCK_COMMENT("string_compare {"); 1351 1352 // Bizarrely, the counts are passed in bytes, regardless of whether they 1353 // are L or U strings, however the result is always in characters. 1354 if (!str1_isL) asrw(cnt1, cnt1, 1); 1355 if (!str2_isL) asrw(cnt2, cnt2, 1); 1356 1357 // Compute the minimum of the string lengths and save the difference. 
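  // In effect: result = cnt1 - cnt2; cnt2 = min(cnt1, cnt2).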
1358 subsw(result, cnt1, cnt2); 1359 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1360 1361 // A very short string 1362 cmpw(cnt2, minCharsInWord); 1363 br(Assembler::LE, SHORT_STRING); 1364 1365 // Compare longwords 1366 // load first parts of strings and finish initialization while loading 1367 { 1368 if (str1_isL == str2_isL) { // LL or UU 1369 ldr(tmp1, Address(str1)); 1370 cmp(str1, str2); 1371 br(Assembler::EQ, DONE); 1372 ldr(tmp2, Address(str2)); 1373 cmp(cnt2, stub_threshold); 1374 br(GE, STUB); 1375 subsw(cnt2, cnt2, minCharsInWord); 1376 br(EQ, TAIL_CHECK); 1377 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1378 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1379 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1380 } else if (isLU) { 1381 ldrs(vtmp, Address(str1)); 1382 ldr(tmp2, Address(str2)); 1383 cmp(cnt2, stub_threshold); 1384 br(GE, STUB); 1385 subw(cnt2, cnt2, 4); 1386 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1387 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1388 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1389 zip1(vtmp, T8B, vtmp, vtmpZ); 1390 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1391 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1392 add(cnt1, cnt1, 4); 1393 fmovd(tmp1, vtmp); 1394 } else { // UL case 1395 ldr(tmp1, Address(str1)); 1396 ldrs(vtmp, Address(str2)); 1397 cmp(cnt2, stub_threshold); 1398 br(GE, STUB); 1399 subw(cnt2, cnt2, 4); 1400 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1401 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1402 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1403 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1404 zip1(vtmp, T8B, vtmp, vtmpZ); 1405 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1406 add(cnt1, cnt1, 8); 1407 fmovd(tmp2, vtmp); 1408 } 1409 adds(cnt2, cnt2, isUL ? 4 : 8); 1410 br(GE, TAIL); 1411 eor(rscratch2, tmp1, tmp2); 1412 cbnz(rscratch2, DIFF); 1413 // main loop 1414 bind(NEXT_WORD); 1415 if (str1_isL == str2_isL) { 1416 ldr(tmp1, Address(str1, cnt2)); 1417 ldr(tmp2, Address(str2, cnt2)); 1418 adds(cnt2, cnt2, 8); 1419 } else if (isLU) { 1420 ldrs(vtmp, Address(str1, cnt1)); 1421 ldr(tmp2, Address(str2, cnt2)); 1422 add(cnt1, cnt1, 4); 1423 zip1(vtmp, T8B, vtmp, vtmpZ); 1424 fmovd(tmp1, vtmp); 1425 adds(cnt2, cnt2, 8); 1426 } else { // UL 1427 ldrs(vtmp, Address(str2, cnt2)); 1428 ldr(tmp1, Address(str1, cnt1)); 1429 zip1(vtmp, T8B, vtmp, vtmpZ); 1430 add(cnt1, cnt1, 8); 1431 fmovd(tmp2, vtmp); 1432 adds(cnt2, cnt2, 4); 1433 } 1434 br(GE, TAIL); 1435 1436 eor(rscratch2, tmp1, tmp2); 1437 cbz(rscratch2, NEXT_WORD); 1438 b(DIFF); 1439 bind(TAIL); 1440 eor(rscratch2, tmp1, tmp2); 1441 cbnz(rscratch2, DIFF); 1442 // Last longword. In the case where length == 4 we compare the 1443 // same longword twice, but that's still faster than another 1444 // conditional branch. 1445 if (str1_isL == str2_isL) { 1446 ldr(tmp1, Address(str1)); 1447 ldr(tmp2, Address(str2)); 1448 } else if (isLU) { 1449 ldrs(vtmp, Address(str1)); 1450 ldr(tmp2, Address(str2)); 1451 zip1(vtmp, T8B, vtmp, vtmpZ); 1452 fmovd(tmp1, vtmp); 1453 } else { // UL 1454 ldrs(vtmp, Address(str2)); 1455 ldr(tmp1, Address(str1)); 1456 zip1(vtmp, T8B, vtmp, vtmpZ); 1457 fmovd(tmp2, vtmp); 1458 } 1459 bind(TAIL_CHECK); 1460 eor(rscratch2, tmp1, tmp2); 1461 cbz(rscratch2, DONE); 1462 1463 // Find the first different characters in the longwords and 1464 // compute their difference. 1465 bind(DIFF); 1466 rev(rscratch2, rscratch2); 1467 clz(rscratch2, rscratch2); 1468 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1469 lsrv(tmp1, tmp1, rscratch2); 1470 (this->*ext_chr)(tmp1, tmp1); 1471 lsrv(tmp2, tmp2, rscratch2); 1472 (this->*ext_chr)(tmp2, tmp2); 1473 subw(result, tmp1, tmp2); 1474 b(DONE); 1475 } 1476 1477 bind(STUB); 1478 RuntimeAddress stub = nullptr; 1479 switch(ae) { 1480 case StrIntrinsicNode::LL: 1481 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1482 break; 1483 case StrIntrinsicNode::UU: 1484 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1485 break; 1486 case StrIntrinsicNode::LU: 1487 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1488 break; 1489 case StrIntrinsicNode::UL: 1490 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1491 break; 1492 default: 1493 ShouldNotReachHere(); 1494 } 1495 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1496 address call = trampoline_call(stub); 1497 if (call == nullptr) { 1498 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1499 ciEnv::current()->record_failure("CodeCache is full"); 1500 return; 1501 } 1502 b(DONE); 1503 1504 bind(SHORT_STRING); 1505 // Is the minimum length zero? 1506 cbz(cnt2, DONE); 1507 // arrange code to do most branches while loading and loading next characters 1508 // while comparing previous 1509 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1510 subs(cnt2, cnt2, 1); 1511 br(EQ, SHORT_LAST_INIT); 1512 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1513 b(SHORT_LOOP_START); 1514 bind(SHORT_LOOP); 1515 subs(cnt2, cnt2, 1); 1516 br(EQ, SHORT_LAST); 1517 bind(SHORT_LOOP_START); 1518 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1519 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1520 cmp(tmp1, cnt1); 1521 br(NE, SHORT_LOOP_TAIL); 1522 subs(cnt2, cnt2, 1); 1523 br(EQ, SHORT_LAST2); 1524 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1525 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1526 cmp(tmp2, rscratch1); 1527 br(EQ, SHORT_LOOP); 1528 sub(result, tmp2, rscratch1); 1529 b(DONE); 1530 bind(SHORT_LOOP_TAIL); 1531 sub(result, tmp1, cnt1); 1532 b(DONE); 1533 bind(SHORT_LAST2); 1534 cmp(tmp2, rscratch1); 1535 br(EQ, DONE); 1536 sub(result, tmp2, rscratch1); 1537 1538 b(DONE); 1539 bind(SHORT_LAST_INIT); 1540 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1541 bind(SHORT_LAST); 1542 cmp(tmp1, cnt1); 1543 br(EQ, DONE); 1544 sub(result, tmp1, cnt1); 1545 1546 bind(DONE); 1547 1548 BLOCK_COMMENT("} string_compare"); 1549 } 1550 1551 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1552 FloatRegister src2, Condition cond, bool isQ) { 1553 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1554 FloatRegister zn = src1, zm = src2; 1555 bool needs_negation = false; 1556 switch (cond) { 1557 case LT: cond = GT; zn = src2; zm = src1; break; 1558 case LE: cond = GE; zn = src2; zm = src1; break; 1559 case LO: cond = HI; zn = src2; zm = src1; break; 1560 case LS: cond = HS; zn = src2; zm = src1; break; 1561 case NE: cond = EQ; needs_negation = true; break; 1562 default: 1563 break; 1564 } 1565 1566 if (is_floating_point_type(bt)) { 1567 fcm(cond, dst, size, zn, zm); 1568 } else { 1569 cm(cond, dst, size, zn, zm); 1570 } 1571 1572 if (needs_negation) { 1573 notr(dst, isQ ? 
T16B : T8B, dst); 1574 } 1575 } 1576 1577 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1578 Condition cond, bool isQ) { 1579 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1580 if (bt == T_FLOAT || bt == T_DOUBLE) { 1581 if (cond == Assembler::NE) { 1582 fcm(Assembler::EQ, dst, size, src); 1583 notr(dst, isQ ? T16B : T8B, dst); 1584 } else { 1585 fcm(cond, dst, size, src); 1586 } 1587 } else { 1588 if (cond == Assembler::NE) { 1589 cm(Assembler::EQ, dst, size, src); 1590 notr(dst, isQ ? T16B : T8B, dst); 1591 } else { 1592 cm(cond, dst, size, src); 1593 } 1594 } 1595 } 1596 1597 // Compress the least significant bit of each byte to the rightmost and clear 1598 // the higher garbage bits. 1599 void C2_MacroAssembler::bytemask_compress(Register dst) { 1600 // Example input, dst = 0x01 00 00 00 01 01 00 01 1601 // The "??" bytes are garbage. 1602 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1603 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1604 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1605 andr(dst, dst, 0xff); // dst = 0x8D 1606 } 1607 1608 // Pack the lowest-numbered bit of each mask element in src into a long value 1609 // in dst, at most the first 64 lane elements. 1610 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1611 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1612 FloatRegister vtmp1, FloatRegister vtmp2) { 1613 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1614 assert_different_registers(dst, rscratch1); 1615 assert_different_registers(vtmp1, vtmp2); 1616 1617 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1618 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1619 // Expected: dst = 0x658D 1620 1621 // Convert the mask into vector with sequential bytes. 1622 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1623 sve_cpy(vtmp1, size, src, 1, false); 1624 if (bt != T_BYTE) { 1625 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1626 } 1627 1628 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1629 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1630 // is to compress each significant bit of the byte in a cross-lane way. Due 1631 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1632 // (bit-compress in each lane) with the biggest lane size (T = D) then 1633 // concatenate the results. 1634 1635 // The second source input of BEXT, initialized with 0x01 in each byte. 1636 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1637 sve_dup(vtmp2, B, 1); 1638 1639 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1640 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1641 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1642 // --------------------------------------- 1643 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1644 sve_bext(vtmp1, D, vtmp1, vtmp2); 1645 1646 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1647 // result to dst. 1648 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1649 // dst = 0x658D 1650 if (lane_cnt <= 8) { 1651 // No need to concatenate. 
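      // With at most 8 lanes, all of the mask bits already sit in byte 0 of vtmp1.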
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different types,
// with a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D, which
// has 24 significant bits, would be an invalid input if the dst predicate register refers
// to a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum unit of one byte, we have to
  // transform the bit mask in the first lane into a byte mask, which can be
  // done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte into every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate mask according to the given vector, in which the elements have been
  // extended to expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
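// The narrowing below is a plain truncation: xtn keeps the low half of each
// source element, so it assumes the values already fit the narrower type.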
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          bool is_unsigned) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

  if (src_size == B) {
    switch (dst_size) {
      case H:
        _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
        break;
      case S:
        _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
        _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
        break;
      case D:
        _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
        _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
        _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
        break;
      default:
        ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
    } else { // D
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
    }
  } else if (src_size == S) {
    _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
      case S:
        sve_uzp1(dst, S, src, tmp);
        break;
      case H:
        assert_different_registers(dst, tmp);
        sve_uzp1(dst, S, src, tmp);
        sve_uzp1(dst, H, dst, tmp);
        break;
      case B:
        assert_different_registers(dst, tmp);
        sve_uzp1(dst, S, src, tmp);
        sve_uzp1(dst, H, dst, tmp);
        sve_uzp1(dst, B, dst, tmp);
        break;
      default:
        ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
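// Each sve_punpklo below doubles the predicate element size by unpacking the low
// half of its source predicate, so one, two or three applications give the 2x,
// 4x and 8x widenings handled here.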
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  sve_pfalse(ptmp);
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}

// Vector reduction multiply for integral type with ASIMD instructions.
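// The vector is repeatedly folded in half with mulv (upper half times lower half)
// until only two lanes remain; those are then combined with isrc using scalar
// multiplies, sign-extending sub-word results after each multiply.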
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1]
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}

// Vector reduction multiply for floating-point type with ASIMD instructions.
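// The lanes are multiplied into fsrc strictly one at a time (fsrc * v[0] * v[1]
// ...), so the result matches a sequential scalar loop; the order matters because
// floating-point multiplication is not associative.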
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
  switch(bt) {
    case T_FLOAT:
      fmuls(dst, fsrc, vsrc);
      ins(vtmp, S, vsrc, 0, 1);
      fmuls(dst, dst, vtmp);
      if (isQ) {
        ins(vtmp, S, vsrc, 0, 2);
        fmuls(dst, dst, vtmp);
        ins(vtmp, S, vsrc, 0, 3);
        fmuls(dst, dst, vtmp);
      }
      break;
    case T_DOUBLE:
      assert(isQ, "unsupported");
      fmuld(dst, fsrc, vsrc);
      ins(vtmp, D, vsrc, 0, 1);
      fmuld(dst, dst, vtmp);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ?
                  D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, is_min ? LT : GT);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, is_min ? LT : GT);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
    } else {
      is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
    }
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, is_min ? LT : GT);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
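// The general shape is: a predicated horizontal SVE reduction into tmp, a move of
// lane 0 to a general-purpose register (smov for sub-word types so the sign is
// preserved, umov otherwise), and a final scalar op that folds in src1. Max/Min
// use cmp + csel for that last step, which is why rflags is clobbered there.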
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
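// Strategy: when lane_cnt matches one of the fixed SVE predicate-constraint
// patterns (VL1-VL8, VL16/32/64/128/256, POW2, MUL4, MUL3) a single ptrue with
// that pattern is emitted; otherwise the count is materialized in rscratch1 and
// the mask is generated with whilelt against zr, which also sets the flags.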
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src  = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst  = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining lanes with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
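  // The same compact-and-narrow sequence is applied to the upper half of src
  // below; the two partial results are then recombined by shifting the high
  // result left by TRUE_CNT lanes (via sve_index + sve_tbl) and OR-ing it into dst.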
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift (cross lane) the compressed high by TRUE_CNT lanes,
  // where TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high (after the shift) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src  = 88 77 66 55 44 33 22 11
  //                  mask = 01 00 00 01 01 00 01 01
  // Expected result: dst  = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining lanes with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat to the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:  dst   = 00 00 00 00 00 44 22 11
  // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross lane) the compressed high by TRUE_CNT lanes,
  // where TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high (after the shift) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1);                      // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint);       // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}