1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 #include "utilities/globalDefinitions.hpp" 36 #include "utilities/powerOfTwo.hpp" 37 38 #ifdef PRODUCT 39 #define BLOCK_COMMENT(str) /* nothing */ 40 #define STOP(error) stop(error) 41 #else 42 #define BLOCK_COMMENT(str) block_comment(str) 43 #define STOP(error) block_comment(error); stop(error) 44 #endif 45 46 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 47 48 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 49 50 // jdk.internal.util.ArraysSupport.vectorizedHashCode 51 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 52 FloatRegister vdata0, FloatRegister vdata1, 53 FloatRegister vdata2, FloatRegister vdata3, 54 FloatRegister vmul0, FloatRegister vmul1, 55 FloatRegister vmul2, FloatRegister vmul3, 56 FloatRegister vpow, FloatRegister vpowm, 57 BasicType eltype) { 58 ARRAYS_HASHCODE_REGISTERS; 59 60 Register tmp1 = rscratch1, tmp2 = rscratch2; 61 62 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE; 63 64 // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We 65 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to 66 // use 4H for chars and shorts instead, but using 8H gives better performance. 67 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8 68 : eltype == T_CHAR || eltype == T_SHORT ? 8 69 : eltype == T_INT ? 4 70 : 0; 71 guarantee(vf, "unsupported eltype"); 72 73 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis. 
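  // For orientation, the value produced here is the usual polynomial hash. A scalar
  // sketch of the intent (illustration only -- not the emitted code, which lets the
  // large-array stub consume vf elements per SIMD iteration and leaves the remainder
  // to the unrolled scalar loop below; element loads are widened according to eltype):
  //
  //   int hash = result;                // incoming initial value
  //   for (int i = 0; i < cnt; i++) {
  //     hash = 31 * hash + ary[i];      // the scalar tail below does this with maddw
  //   }
  //   result = hash;
  //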
74 const size_t unroll_factor = 4; 75 76 switch (eltype) { 77 case T_BOOLEAN: 78 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); 79 break; 80 case T_CHAR: 81 BLOCK_COMMENT("arrays_hashcode(char) {"); 82 break; 83 case T_BYTE: 84 BLOCK_COMMENT("arrays_hashcode(byte) {"); 85 break; 86 case T_SHORT: 87 BLOCK_COMMENT("arrays_hashcode(short) {"); 88 break; 89 case T_INT: 90 BLOCK_COMMENT("arrays_hashcode(int) {"); 91 break; 92 default: 93 ShouldNotReachHere(); 94 } 95 96 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop 97 // implemented by the stub executes just once. Call the stub only if at least two iterations will 98 // be executed. 99 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf; 100 cmpw(cnt, large_threshold); 101 br(Assembler::HS, LARGE); 102 103 bind(TAIL); 104 105 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past 106 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs. 107 // Iteration eats up the remainder, uf elements at a time. 108 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC"); 109 andr(tmp2, cnt, unroll_factor - 1); 110 adr(tmp1, BR_BASE); 111 sub(tmp1, tmp1, tmp2, ext::sxtw, 3); 112 movw(tmp2, 0x1f); 113 br(tmp1); 114 115 bind(LOOP); 116 for (size_t i = 0; i < unroll_factor; ++i) { 117 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype); 118 maddw(result, result, tmp2, tmp1); 119 } 120 bind(BR_BASE); 121 subsw(cnt, cnt, unroll_factor); 122 br(Assembler::HS, LOOP); 123 124 b(DONE); 125 126 bind(LARGE); 127 128 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype)); 129 assert(stub.target() != nullptr, "array_hashcode stub has not been generated"); 130 address tpc = trampoline_call(stub); 131 if (tpc == nullptr) { 132 DEBUG_ONLY(reset_labels(TAIL, BR_BASE)); 133 postcond(pc() == badAddress); 134 return nullptr; 135 } 136 137 bind(DONE); 138 139 BLOCK_COMMENT("} // arrays_hashcode"); 140 141 postcond(pc() != badAddress); 142 return pc(); 143 } 144 145 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 146 Register tmp2Reg, Register tmp3Reg) { 147 Register oop = objectReg; 148 Register box = boxReg; 149 Register disp_hdr = tmpReg; 150 Register tmp = tmp2Reg; 151 Label cont; 152 Label object_has_monitor; 153 Label count, no_count; 154 155 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 156 assert_different_registers(oop, box, tmp, disp_hdr); 157 158 // Load markWord from object into displaced_header. 159 ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 160 161 if (DiagnoseSyncOnValueBasedClasses != 0) { 162 load_klass(tmp, oop); 163 ldrb(tmp, Address(tmp, Klass::misc_flags_offset())); 164 tst(tmp, KlassFlags::_misc_is_value_based_class); 165 br(Assembler::NE, cont); 166 } 167 168 // Check for existing monitor 169 tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor); 170 171 if (LockingMode == LM_MONITOR) { 172 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 173 b(cont); 174 } else { 175 assert(LockingMode == LM_LEGACY, "must be"); 176 // Set tmp to be (markWord of object | UNLOCK_VALUE). 177 orr(tmp, disp_hdr, markWord::unlocked_value); 178 179 // Initialize the box. (Must happen before we update the object mark!) 
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If condition is true we are cont and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty (EntryList first - by convention).
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, tmpReg);
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont);     // If so we are done.

  // Check if there is a successor.
  ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
  cmp(rscratch1, zr);
  br(Assembler::NE, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

  cmp(zr, rthread); // Set Flag to NE => slow path
  b(cont);

  bind(unlocked);
  cmp(zr, zr); // Set Flag to EQ => fast path

  // Intentional fall-through

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
365 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 366 } 367 368 if (DiagnoseSyncOnValueBasedClasses != 0) { 369 load_klass(t1, obj); 370 ldrb(t1, Address(t1, Klass::misc_flags_offset())); 371 tst(t1, KlassFlags::_misc_is_value_based_class); 372 br(Assembler::NE, slow_path); 373 } 374 375 const Register t1_mark = t1; 376 const Register t3_t = t3; 377 378 { // Lightweight locking 379 380 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ 381 Label push; 382 383 const Register t2_top = t2; 384 385 // Check if lock-stack is full. 386 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 387 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1); 388 br(Assembler::GT, slow_path); 389 390 // Check if recursive. 391 subw(t3_t, t2_top, oopSize); 392 ldr(t3_t, Address(rthread, t3_t)); 393 cmp(obj, t3_t); 394 br(Assembler::EQ, push); 395 396 // Relaxed normal load to check for monitor. Optimization for monitor case. 397 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 398 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 399 400 // Not inflated 401 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 402 403 // Try to lock. Transition lock-bits 0b01 => 0b00 404 orr(t1_mark, t1_mark, markWord::unlocked_value); 405 eor(t3_t, t1_mark, markWord::unlocked_value); 406 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 407 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 408 br(Assembler::NE, slow_path); 409 410 bind(push); 411 // After successful lock, push object on lock-stack. 412 str(obj, Address(rthread, t2_top)); 413 addw(t2_top, t2_top, oopSize); 414 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 415 b(locked); 416 } 417 418 { // Handle inflated monitor. 419 bind(inflated); 420 421 const Register t1_monitor = t1; 422 423 if (!UseObjectMonitorTable) { 424 assert(t1_monitor == t1_mark, "should be the same here"); 425 } else { 426 Label monitor_found; 427 428 // Load cache address 429 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset())); 430 431 const int num_unrolled = 2; 432 for (int i = 0; i < num_unrolled; i++) { 433 ldr(t1, Address(t3_t)); 434 cmp(obj, t1); 435 br(Assembler::EQ, monitor_found); 436 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 437 } 438 439 Label loop; 440 441 // Search for obj in cache. 442 bind(loop); 443 444 // Check for match. 445 ldr(t1, Address(t3_t)); 446 cmp(obj, t1); 447 br(Assembler::EQ, monitor_found); 448 449 // Search until null encountered, guaranteed _null_sentinel at end. 450 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 451 cbnz(t1, loop); 452 // Cache Miss, NE set from cmp above, cbnz does not set flags 453 b(slow_path); 454 455 bind(monitor_found); 456 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference())); 457 } 458 459 const Register t2_owner_addr = t2; 460 const Register t3_owner = t3; 461 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 462 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag); 463 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag); 464 465 Label monitor_locked; 466 467 // Compute owner address. 468 lea(t2_owner_addr, owner_address); 469 470 // CAS owner (null => current thread). 
471 cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true, 472 /*release*/ false, /*weak*/ false, t3_owner); 473 br(Assembler::EQ, monitor_locked); 474 475 // Check if recursive. 476 cmp(t3_owner, rthread); 477 br(Assembler::NE, slow_path); 478 479 // Recursive. 480 increment(recursions_address, 1); 481 482 bind(monitor_locked); 483 if (UseObjectMonitorTable) { 484 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 485 } 486 } 487 488 bind(locked); 489 increment(Address(rthread, JavaThread::held_monitor_count_offset())); 490 491 #ifdef ASSERT 492 // Check that locked label is reached with Flags == EQ. 493 Label flag_correct; 494 br(Assembler::EQ, flag_correct); 495 stop("Fast Lock Flag != EQ"); 496 #endif 497 498 bind(slow_path); 499 #ifdef ASSERT 500 // Check that slow_path label is reached with Flags == NE. 501 br(Assembler::NE, flag_correct); 502 stop("Fast Lock Flag != NE"); 503 bind(flag_correct); 504 #endif 505 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 506 } 507 508 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1, 509 Register t2, Register t3) { 510 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 511 assert_different_registers(obj, box, t1, t2, t3); 512 513 // Handle inflated monitor. 514 Label inflated, inflated_load_mark; 515 // Finish fast unlock successfully. MUST branch to with flag == EQ 516 Label unlocked; 517 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 518 Label slow_path; 519 520 const Register t1_mark = t1; 521 const Register t2_top = t2; 522 const Register t3_t = t3; 523 524 { // Lightweight unlock 525 526 Label push_and_slow_path; 527 528 // Check if obj is top of lock-stack. 529 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 530 subw(t2_top, t2_top, oopSize); 531 ldr(t3_t, Address(rthread, t2_top)); 532 cmp(obj, t3_t); 533 // Top of lock stack was not obj. Must be monitor. 534 br(Assembler::NE, inflated_load_mark); 535 536 // Pop lock-stack. 537 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 538 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 539 540 // Check if recursive. 541 subw(t3_t, t2_top, oopSize); 542 ldr(t3_t, Address(rthread, t3_t)); 543 cmp(obj, t3_t); 544 br(Assembler::EQ, unlocked); 545 546 // Not recursive. 547 // Load Mark. 548 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 549 550 // Check header for monitor (0b10). 551 // Because we got here by popping (meaning we pushed in locked) 552 // there will be no monitor in the box. So we need to push back the obj 553 // so that the runtime can fix any potential anonymous owner. 554 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated); 555 556 // Try to unlock. Transition lock bits 0b00 => 0b01 557 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 558 orr(t3_t, t1_mark, markWord::unlocked_value); 559 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 560 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 561 br(Assembler::EQ, unlocked); 562 563 bind(push_and_slow_path); 564 // Compare and exchange failed. 565 // Restore lock-stack and handle the unlock in runtime. 566 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 567 addw(t2_top, t2_top, oopSize); 568 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 569 b(slow_path); 570 } 571 572 573 { // Handle inflated monitor. 
574 bind(inflated_load_mark); 575 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 576 #ifdef ASSERT 577 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 578 stop("Fast Unlock not monitor"); 579 #endif 580 581 bind(inflated); 582 583 #ifdef ASSERT 584 Label check_done; 585 subw(t2_top, t2_top, oopSize); 586 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 587 br(Assembler::LT, check_done); 588 ldr(t3_t, Address(rthread, t2_top)); 589 cmp(obj, t3_t); 590 br(Assembler::NE, inflated); 591 stop("Fast Unlock lock on stack"); 592 bind(check_done); 593 #endif 594 595 const Register t1_monitor = t1; 596 597 if (!UseObjectMonitorTable) { 598 assert(t1_monitor == t1_mark, "should be the same here"); 599 600 // Untag the monitor. 601 add(t1_monitor, t1_mark, -(int)markWord::monitor_value); 602 } else { 603 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 604 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 605 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 606 br(Assembler::LO, slow_path); 607 } 608 609 const Register t2_recursions = t2; 610 Label not_recursive; 611 612 // Check if recursive. 613 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 614 cbz(t2_recursions, not_recursive); 615 616 // Recursive unlock. 617 sub(t2_recursions, t2_recursions, 1u); 618 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 619 // Set flag == EQ 620 cmp(t2_recursions, t2_recursions); 621 b(unlocked); 622 623 bind(not_recursive); 624 625 const Register t2_owner_addr = t2; 626 627 // Compute owner address. 628 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 629 630 // Set owner to null. 631 // Release to satisfy the JMM 632 stlr(zr, t2_owner_addr); 633 // We need a full fence after clearing owner to avoid stranding. 634 // StoreLoad achieves this. 635 membar(StoreLoad); 636 637 // Check if the entry lists are empty (EntryList first - by convention). 638 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset())); 639 ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset())); 640 orr(rscratch1, rscratch1, t3_t); 641 cmp(rscratch1, zr); 642 br(Assembler::EQ, unlocked); // If so we are done. 643 644 // Check if there is a successor. 645 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset())); 646 cmp(rscratch1, zr); 647 br(Assembler::NE, unlocked); // If so we are done. 648 649 // Save the monitor pointer in the current thread, so we can try to 650 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 651 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 652 653 cmp(zr, rthread); // Set Flag to NE => slow path 654 b(slow_path); 655 } 656 657 bind(unlocked); 658 decrement(Address(rthread, JavaThread::held_monitor_count_offset())); 659 cmp(zr, zr); // Set Flags to EQ => fast path 660 661 #ifdef ASSERT 662 // Check that unlocked label is reached with Flags == EQ. 663 Label flag_correct; 664 br(Assembler::EQ, flag_correct); 665 stop("Fast Unlock Flag != EQ"); 666 #endif 667 668 bind(slow_path); 669 #ifdef ASSERT 670 // Check that slow_path label is reached with Flags == NE. 671 br(Assembler::NE, flag_correct); 672 stop("Fast Unlock Flag != NE"); 673 bind(flag_correct); 674 #endif 675 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 676 } 677 678 // Search for str1 in str2 and return index or -1 679 // Clobbers: rscratch1, rscratch2, rflags. 
// May also clobber v0-v1, when icnt1 == -1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
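  //
  // A small worked example of the bad-character shift (illustrative values only,
  // not taken from the code): for the pattern "abc" (m = 3) the preprocessing
  // loop leaves bc[c] = 3 for every byte c except bc['a'] = 2 and bc['b'] = 1.
  // If the source character aligned with the last pattern position does not
  // occur in the pattern, the whole pattern can be shifted by 3 at once; if it
  // is 'a', only by 2. The pseudocode the implementation follows is: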
751 // 752 // #define ASIZE 256 753 // 754 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 755 // int i, j; 756 // unsigned c; 757 // unsigned char bc[ASIZE]; 758 // 759 // /* Preprocessing */ 760 // for (i = 0; i < ASIZE; ++i) 761 // bc[i] = m; 762 // for (i = 0; i < m - 1; ) { 763 // c = x[i]; 764 // ++i; 765 // // c < 256 for Latin1 string, so, no need for branch 766 // #ifdef PATTERN_STRING_IS_LATIN1 767 // bc[c] = m - i; 768 // #else 769 // if (c < ASIZE) bc[c] = m - i; 770 // #endif 771 // } 772 // 773 // /* Searching */ 774 // j = 0; 775 // while (j <= n - m) { 776 // c = y[i+j]; 777 // if (x[m-1] == c) 778 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 779 // if (i < 0) return j; 780 // // c < 256 for Latin1 string, so, no need for branch 781 // #ifdef SOURCE_STRING_IS_LATIN1 782 // // LL case: (c< 256) always true. Remove branch 783 // j += bc[y[j+m-1]]; 784 // #endif 785 // #ifndef PATTERN_STRING_IS_UTF 786 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 787 // if (c < ASIZE) 788 // j += bc[y[j+m-1]]; 789 // else 790 // j += 1 791 // #endif 792 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 793 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 794 // if (c < ASIZE) 795 // j += bc[y[j+m-1]]; 796 // else 797 // j += m 798 // #endif 799 // } 800 // } 801 802 if (icnt1 == -1) { 803 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 804 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 805 Register cnt1end = tmp2; 806 Register str2end = cnt2; 807 Register skipch = tmp2; 808 809 // str1 length is >=8, so, we can read at least 1 register for cases when 810 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 811 // UL case. We'll re-read last character in inner pre-loop code to have 812 // single outer pre-loop load 813 const int firstStep = isL ? 7 : 3; 814 815 const int ASIZE = 256; 816 const int STORED_BYTES = 32; // amount of bytes stored per instruction 817 sub(sp, sp, ASIZE); 818 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 819 mov(ch1, sp); 820 BIND(BM_INIT_LOOP); 821 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 822 subs(tmp5, tmp5, 1); 823 br(GT, BM_INIT_LOOP); 824 825 sub(cnt1tmp, cnt1, 1); 826 mov(tmp5, str2); 827 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 828 sub(ch2, cnt1, 1); 829 mov(tmp3, str1); 830 BIND(BCLOOP); 831 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 832 if (!str1_isL) { 833 subs(zr, ch1, ASIZE); 834 br(HS, BCSKIP); 835 } 836 strb(ch2, Address(sp, ch1)); 837 BIND(BCSKIP); 838 subs(ch2, ch2, 1); 839 br(GT, BCLOOP); 840 841 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 842 if (str1_isL == str2_isL) { 843 // load last 8 bytes (8LL/4UU symbols) 844 ldr(tmp6, Address(tmp6, -wordSize)); 845 } else { 846 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 847 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 848 // it's still faster than per-character loads+checks 849 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 850 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 851 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 852 andr(tmp6, tmp6, 0xFF); // str1[N-4] 853 orr(ch2, ch1, ch2, LSL, 16); 854 orr(tmp6, tmp6, tmp3, LSL, 48); 855 orr(tmp6, tmp6, ch2, LSL, 16); 856 } 857 BIND(BMLOOPSTR2); 858 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 859 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 860 if (str1_isL == str2_isL) { 861 // re-init tmp3. It's for free because it's executed in parallel with 862 // load above. Alternative is to initialize it before loop, but it'll 863 // affect performance on in-order systems with 2 or more ld/st pipelines 864 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 865 } 866 if (!isL) { // UU/UL case 867 lsl(ch2, cnt1tmp, 1); // offset in bytes 868 } 869 cmp(tmp3, skipch); 870 br(NE, BMSKIP); 871 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 872 mov(ch1, tmp6); 873 if (isL) { 874 b(BMLOOPSTR1_AFTER_LOAD); 875 } else { 876 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 877 b(BMLOOPSTR1_CMP); 878 } 879 BIND(BMLOOPSTR1); 880 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 881 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 882 BIND(BMLOOPSTR1_AFTER_LOAD); 883 subs(cnt1tmp, cnt1tmp, 1); 884 br(LT, BMLOOPSTR1_LASTCMP); 885 BIND(BMLOOPSTR1_CMP); 886 cmp(ch1, ch2); 887 br(EQ, BMLOOPSTR1); 888 BIND(BMSKIP); 889 if (!isL) { 890 // if we've met UTF symbol while searching Latin1 pattern, then we can 891 // skip cnt1 symbols 892 if (str1_isL != str2_isL) { 893 mov(result_tmp, cnt1); 894 } else { 895 mov(result_tmp, 1); 896 } 897 subs(zr, skipch, ASIZE); 898 br(HS, BMADV); 899 } 900 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 901 BIND(BMADV); 902 sub(cnt1tmp, cnt1, 1); 903 add(str2, str2, result_tmp, LSL, str2_chr_shift); 904 cmp(str2, str2end); 905 br(LE, BMLOOPSTR2); 906 add(sp, sp, ASIZE); 907 b(NOMATCH); 908 BIND(BMLOOPSTR1_LASTCMP); 909 cmp(ch1, ch2); 910 br(NE, BMSKIP); 911 BIND(BMMATCH); 912 sub(result, str2, tmp5); 913 if (!str2_isL) lsr(result, result, 1); 914 add(sp, sp, ASIZE); 915 b(DONE); 916 917 BIND(LINEARSTUB); 918 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 919 br(LT, LINEAR_MEDIUM); 920 mov(result, zr); 921 RuntimeAddress stub = nullptr; 922 if (isL) { 923 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 924 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 925 } else if (str1_isL) { 926 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 927 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 928 } else { 929 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 930 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 931 } 932 address call = trampoline_call(stub); 933 if (call == nullptr) { 934 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 935 ciEnv::current()->record_failure("CodeCache is full"); 936 return; 937 } 938 b(DONE); 939 } 940 941 BIND(LINEARSEARCH); 942 { 943 Label DO1, DO2, DO3; 944 945 Register str2tmp = tmp2; 946 Register first = tmp3; 947 948 if (icnt1 == 
-1) 949 { 950 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 951 952 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 953 br(LT, DOSHORT); 954 BIND(LINEAR_MEDIUM); 955 (this->*str1_load_1chr)(first, Address(str1)); 956 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 957 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 958 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 959 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 960 961 BIND(FIRST_LOOP); 962 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 963 cmp(first, ch2); 964 br(EQ, STR1_LOOP); 965 BIND(STR2_NEXT); 966 adds(cnt2_neg, cnt2_neg, str2_chr_size); 967 br(LE, FIRST_LOOP); 968 b(NOMATCH); 969 970 BIND(STR1_LOOP); 971 adds(cnt1tmp, cnt1_neg, str1_chr_size); 972 add(cnt2tmp, cnt2_neg, str2_chr_size); 973 br(GE, MATCH); 974 975 BIND(STR1_NEXT); 976 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 977 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 978 cmp(ch1, ch2); 979 br(NE, STR2_NEXT); 980 adds(cnt1tmp, cnt1tmp, str1_chr_size); 981 add(cnt2tmp, cnt2tmp, str2_chr_size); 982 br(LT, STR1_NEXT); 983 b(MATCH); 984 985 BIND(DOSHORT); 986 if (str1_isL == str2_isL) { 987 cmp(cnt1, (u1)2); 988 br(LT, DO1); 989 br(GT, DO3); 990 } 991 } 992 993 if (icnt1 == 4) { 994 Label CH1_LOOP; 995 996 (this->*load_4chr)(ch1, str1); 997 sub(result_tmp, cnt2, 4); 998 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 999 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1000 1001 BIND(CH1_LOOP); 1002 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 1003 cmp(ch1, ch2); 1004 br(EQ, MATCH); 1005 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1006 br(LE, CH1_LOOP); 1007 b(NOMATCH); 1008 } 1009 1010 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 1011 Label CH1_LOOP; 1012 1013 BIND(DO2); 1014 (this->*load_2chr)(ch1, str1); 1015 if (icnt1 == 2) { 1016 sub(result_tmp, cnt2, 2); 1017 } 1018 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1019 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1020 BIND(CH1_LOOP); 1021 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1022 cmp(ch1, ch2); 1023 br(EQ, MATCH); 1024 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1025 br(LE, CH1_LOOP); 1026 b(NOMATCH); 1027 } 1028 1029 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 1030 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 1031 1032 BIND(DO3); 1033 (this->*load_2chr)(first, str1); 1034 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 1035 if (icnt1 == 3) { 1036 sub(result_tmp, cnt2, 3); 1037 } 1038 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1039 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1040 BIND(FIRST_LOOP); 1041 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1042 cmpw(first, ch2); 1043 br(EQ, STR1_LOOP); 1044 BIND(STR2_NEXT); 1045 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1046 br(LE, FIRST_LOOP); 1047 b(NOMATCH); 1048 1049 BIND(STR1_LOOP); 1050 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 1051 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 1052 cmp(ch1, ch2); 1053 br(NE, STR2_NEXT); 1054 b(MATCH); 1055 } 1056 1057 if (icnt1 == -1 || icnt1 == 1) { 1058 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 1059 1060 BIND(DO1); 1061 (this->*str1_load_1chr)(ch1, str1); 1062 cmp(cnt2, (u1)8); 1063 br(LT, DO1_SHORT); 1064 1065 sub(result_tmp, cnt2, 8/str2_chr_size); 1066 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1067 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 1068 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1069 1070 if (str2_isL) { 1071 orr(ch1, ch1, ch1, LSL, 8); 1072 } 1073 orr(ch1, ch1, ch1, LSL, 16); 1074 orr(ch1, ch1, ch1, LSL, 32); 1075 BIND(CH1_LOOP); 1076 ldr(ch2, Address(str2, cnt2_neg)); 1077 eor(ch2, ch1, ch2); 1078 sub(tmp1, ch2, tmp3); 1079 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 1080 bics(tmp1, tmp1, tmp2); 1081 br(NE, HAS_ZERO); 1082 adds(cnt2_neg, cnt2_neg, 8); 1083 br(LT, CH1_LOOP); 1084 1085 cmp(cnt2_neg, (u1)8); 1086 mov(cnt2_neg, 0); 1087 br(LT, CH1_LOOP); 1088 b(NOMATCH); 1089 1090 BIND(HAS_ZERO); 1091 rev(tmp1, tmp1); 1092 clz(tmp1, tmp1); 1093 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 1094 b(MATCH); 1095 1096 BIND(DO1_SHORT); 1097 mov(result_tmp, cnt2); 1098 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 1099 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 1100 BIND(DO1_LOOP); 1101 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 1102 cmpw(ch1, ch2); 1103 br(EQ, MATCH); 1104 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1105 br(LT, DO1_LOOP); 1106 } 1107 } 1108 BIND(NOMATCH); 1109 mov(result, -1); 1110 b(DONE); 1111 BIND(MATCH); 1112 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 1113 BIND(DONE); 1114 } 1115 1116 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 1117 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 1118 1119 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 1120 Register ch, Register result, 1121 Register tmp1, Register tmp2, Register tmp3) 1122 { 1123 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1124 Register cnt1_neg = cnt1; 1125 Register ch1 = rscratch1; 1126 Register result_tmp = rscratch2; 1127 1128 cbz(cnt1, NOMATCH); 1129 1130 cmp(cnt1, (u1)4); 1131 br(LT, DO1_SHORT); 1132 1133 orr(ch, ch, ch, LSL, 16); 1134 orr(ch, ch, ch, LSL, 32); 1135 1136 sub(cnt1, cnt1, 4); 1137 mov(result_tmp, cnt1); 1138 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1139 sub(cnt1_neg, zr, cnt1, LSL, 1); 1140 1141 mov(tmp3, 0x0001000100010001); 1142 1143 BIND(CH1_LOOP); 1144 ldr(ch1, Address(str1, cnt1_neg)); 1145 eor(ch1, ch, ch1); 1146 sub(tmp1, ch1, tmp3); 1147 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 1148 bics(tmp1, tmp1, tmp2); 1149 br(NE, HAS_ZERO); 1150 adds(cnt1_neg, cnt1_neg, 8); 1151 br(LT, CH1_LOOP); 1152 1153 cmp(cnt1_neg, (u1)8); 1154 mov(cnt1_neg, 0); 1155 br(LT, CH1_LOOP); 1156 b(NOMATCH); 1157 1158 BIND(HAS_ZERO); 1159 rev(tmp1, tmp1); 1160 clz(tmp1, tmp1); 1161 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1162 b(MATCH); 1163 1164 BIND(DO1_SHORT); 1165 mov(result_tmp, cnt1); 1166 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1167 sub(cnt1_neg, zr, cnt1, LSL, 1); 1168 BIND(DO1_LOOP); 1169 ldrh(ch1, Address(str1, cnt1_neg)); 1170 cmpw(ch, ch1); 1171 br(EQ, MATCH); 1172 adds(cnt1_neg, cnt1_neg, 2); 1173 br(LT, DO1_LOOP); 1174 BIND(NOMATCH); 1175 mov(result, -1); 1176 b(DONE); 1177 BIND(MATCH); 1178 add(result, result_tmp, cnt1_neg, ASR, 1); 1179 BIND(DONE); 1180 } 1181 1182 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 1183 Register ch, Register result, 1184 FloatRegister ztmp1, 1185 FloatRegister ztmp2, 1186 PRegister tmp_pg, 1187 PRegister tmp_pdn, bool isL) 1188 { 1189 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 
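  // Conceptually, the SVE loop below is a predicated form of the scalar search
  // (sketch only; the emitted code compares a whole vector of characters per
  // iteration, with sve_whilelt building the lane mask for the final partial
  // vector, and sve_brka/sve_incp recovering the index of the first match):
  //
  //   for (int i = 0; i < cnt1; i++) {
  //     if (str1[i] == ch) return i;
  //   }
  //   return -1;
  //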
1190 assert(tmp_pg->is_governing(), 1191 "this register has to be a governing predicate register"); 1192 1193 Label LOOP, MATCH, DONE, NOMATCH; 1194 Register vec_len = rscratch1; 1195 Register idx = rscratch2; 1196 1197 SIMD_RegVariant T = (isL == true) ? B : H; 1198 1199 cbz(cnt1, NOMATCH); 1200 1201 // Assign the particular char throughout the vector. 1202 sve_dup(ztmp2, T, ch); 1203 if (isL) { 1204 sve_cntb(vec_len); 1205 } else { 1206 sve_cnth(vec_len); 1207 } 1208 mov(idx, 0); 1209 1210 // Generate a predicate to control the reading of input string. 1211 sve_whilelt(tmp_pg, T, idx, cnt1); 1212 1213 BIND(LOOP); 1214 // Read a vector of 8- or 16-bit data depending on the string type. Note 1215 // that inactive elements indicated by the predicate register won't cause 1216 // a data read from memory to the destination vector. 1217 if (isL) { 1218 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1219 } else { 1220 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1221 } 1222 add(idx, idx, vec_len); 1223 1224 // Perform the comparison. An element of the destination predicate is set 1225 // to active if the particular char is matched. 1226 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1227 1228 // Branch if the particular char is found. 1229 br(NE, MATCH); 1230 1231 sve_whilelt(tmp_pg, T, idx, cnt1); 1232 1233 // Loop back if the particular char not found. 1234 br(MI, LOOP); 1235 1236 BIND(NOMATCH); 1237 mov(result, -1); 1238 b(DONE); 1239 1240 BIND(MATCH); 1241 // Undo the index increment. 1242 sub(idx, idx, vec_len); 1243 1244 // Crop the vector to find its location. 1245 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1246 add(result, idx, -1); 1247 sve_incp(result, T, tmp_pdn); 1248 BIND(DONE); 1249 } 1250 1251 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1252 Register ch, Register result, 1253 Register tmp1, Register tmp2, Register tmp3) 1254 { 1255 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1256 Register cnt1_neg = cnt1; 1257 Register ch1 = rscratch1; 1258 Register result_tmp = rscratch2; 1259 1260 cbz(cnt1, NOMATCH); 1261 1262 cmp(cnt1, (u1)8); 1263 br(LT, DO1_SHORT); 1264 1265 orr(ch, ch, ch, LSL, 8); 1266 orr(ch, ch, ch, LSL, 16); 1267 orr(ch, ch, ch, LSL, 32); 1268 1269 sub(cnt1, cnt1, 8); 1270 mov(result_tmp, cnt1); 1271 lea(str1, Address(str1, cnt1)); 1272 sub(cnt1_neg, zr, cnt1); 1273 1274 mov(tmp3, 0x0101010101010101); 1275 1276 BIND(CH1_LOOP); 1277 ldr(ch1, Address(str1, cnt1_neg)); 1278 eor(ch1, ch, ch1); 1279 sub(tmp1, ch1, tmp3); 1280 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1281 bics(tmp1, tmp1, tmp2); 1282 br(NE, HAS_ZERO); 1283 adds(cnt1_neg, cnt1_neg, 8); 1284 br(LT, CH1_LOOP); 1285 1286 cmp(cnt1_neg, (u1)8); 1287 mov(cnt1_neg, 0); 1288 br(LT, CH1_LOOP); 1289 b(NOMATCH); 1290 1291 BIND(HAS_ZERO); 1292 rev(tmp1, tmp1); 1293 clz(tmp1, tmp1); 1294 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1295 b(MATCH); 1296 1297 BIND(DO1_SHORT); 1298 mov(result_tmp, cnt1); 1299 lea(str1, Address(str1, cnt1)); 1300 sub(cnt1_neg, zr, cnt1); 1301 BIND(DO1_LOOP); 1302 ldrb(ch1, Address(str1, cnt1_neg)); 1303 cmp(ch, ch1); 1304 br(EQ, MATCH); 1305 adds(cnt1_neg, cnt1_neg, 1); 1306 br(LT, DO1_LOOP); 1307 BIND(NOMATCH); 1308 mov(result, -1); 1309 b(DONE); 1310 BIND(MATCH); 1311 add(result, result_tmp, cnt1_neg); 1312 BIND(DONE); 1313 } 1314 1315 // Compare strings. 
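// A scalar sketch of what the code below computes (illustration only; 'ae'
// selects the Latin1/UTF-16 encoding of each operand, and the byte counts
// passed in are converted to character counts first):
//
//   int string_compare(chr* s1, int len1, chr* s2, int len2) {
//     int n = min(len1, len2);
//     for (int i = 0; i < n; i++) {
//       if (s1[i] != s2[i]) return s1[i] - s2[i];
//     }
//     return len1 - len2;
//   }
//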
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
                                       FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                       PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
        DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
        SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
1358 subsw(result, cnt1, cnt2); 1359 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1360 1361 // A very short string 1362 cmpw(cnt2, minCharsInWord); 1363 br(Assembler::LE, SHORT_STRING); 1364 1365 // Compare longwords 1366 // load first parts of strings and finish initialization while loading 1367 { 1368 if (str1_isL == str2_isL) { // LL or UU 1369 ldr(tmp1, Address(str1)); 1370 cmp(str1, str2); 1371 br(Assembler::EQ, DONE); 1372 ldr(tmp2, Address(str2)); 1373 cmp(cnt2, stub_threshold); 1374 br(GE, STUB); 1375 subsw(cnt2, cnt2, minCharsInWord); 1376 br(EQ, TAIL_CHECK); 1377 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1378 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1379 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1380 } else if (isLU) { 1381 ldrs(vtmp, Address(str1)); 1382 ldr(tmp2, Address(str2)); 1383 cmp(cnt2, stub_threshold); 1384 br(GE, STUB); 1385 subw(cnt2, cnt2, 4); 1386 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1387 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1388 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1389 zip1(vtmp, T8B, vtmp, vtmpZ); 1390 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1391 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1392 add(cnt1, cnt1, 4); 1393 fmovd(tmp1, vtmp); 1394 } else { // UL case 1395 ldr(tmp1, Address(str1)); 1396 ldrs(vtmp, Address(str2)); 1397 cmp(cnt2, stub_threshold); 1398 br(GE, STUB); 1399 subw(cnt2, cnt2, 4); 1400 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1401 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1402 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1403 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1404 zip1(vtmp, T8B, vtmp, vtmpZ); 1405 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1406 add(cnt1, cnt1, 8); 1407 fmovd(tmp2, vtmp); 1408 } 1409 adds(cnt2, cnt2, isUL ? 4 : 8); 1410 br(GE, TAIL); 1411 eor(rscratch2, tmp1, tmp2); 1412 cbnz(rscratch2, DIFF); 1413 // main loop 1414 bind(NEXT_WORD); 1415 if (str1_isL == str2_isL) { 1416 ldr(tmp1, Address(str1, cnt2)); 1417 ldr(tmp2, Address(str2, cnt2)); 1418 adds(cnt2, cnt2, 8); 1419 } else if (isLU) { 1420 ldrs(vtmp, Address(str1, cnt1)); 1421 ldr(tmp2, Address(str2, cnt2)); 1422 add(cnt1, cnt1, 4); 1423 zip1(vtmp, T8B, vtmp, vtmpZ); 1424 fmovd(tmp1, vtmp); 1425 adds(cnt2, cnt2, 8); 1426 } else { // UL 1427 ldrs(vtmp, Address(str2, cnt2)); 1428 ldr(tmp1, Address(str1, cnt1)); 1429 zip1(vtmp, T8B, vtmp, vtmpZ); 1430 add(cnt1, cnt1, 8); 1431 fmovd(tmp2, vtmp); 1432 adds(cnt2, cnt2, 4); 1433 } 1434 br(GE, TAIL); 1435 1436 eor(rscratch2, tmp1, tmp2); 1437 cbz(rscratch2, NEXT_WORD); 1438 b(DIFF); 1439 bind(TAIL); 1440 eor(rscratch2, tmp1, tmp2); 1441 cbnz(rscratch2, DIFF); 1442 // Last longword. In the case where length == 4 we compare the 1443 // same longword twice, but that's still faster than another 1444 // conditional branch. 1445 if (str1_isL == str2_isL) { 1446 ldr(tmp1, Address(str1)); 1447 ldr(tmp2, Address(str2)); 1448 } else if (isLU) { 1449 ldrs(vtmp, Address(str1)); 1450 ldr(tmp2, Address(str2)); 1451 zip1(vtmp, T8B, vtmp, vtmpZ); 1452 fmovd(tmp1, vtmp); 1453 } else { // UL 1454 ldrs(vtmp, Address(str2)); 1455 ldr(tmp1, Address(str1)); 1456 zip1(vtmp, T8B, vtmp, vtmpZ); 1457 fmovd(tmp2, vtmp); 1458 } 1459 bind(TAIL_CHECK); 1460 eor(rscratch2, tmp1, tmp2); 1461 cbz(rscratch2, DONE); 1462 1463 // Find the first different characters in the longwords and 1464 // compute their difference. 1465 bind(DIFF); 1466 rev(rscratch2, rscratch2); 1467 clz(rscratch2, rscratch2); 1468 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1469 lsrv(tmp1, tmp1, rscratch2); 1470 (this->*ext_chr)(tmp1, tmp1); 1471 lsrv(tmp2, tmp2, rscratch2); 1472 (this->*ext_chr)(tmp2, tmp2); 1473 subw(result, tmp1, tmp2); 1474 b(DONE); 1475 } 1476 1477 bind(STUB); 1478 RuntimeAddress stub = nullptr; 1479 switch(ae) { 1480 case StrIntrinsicNode::LL: 1481 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1482 break; 1483 case StrIntrinsicNode::UU: 1484 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1485 break; 1486 case StrIntrinsicNode::LU: 1487 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1488 break; 1489 case StrIntrinsicNode::UL: 1490 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1491 break; 1492 default: 1493 ShouldNotReachHere(); 1494 } 1495 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1496 address call = trampoline_call(stub); 1497 if (call == nullptr) { 1498 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1499 ciEnv::current()->record_failure("CodeCache is full"); 1500 return; 1501 } 1502 b(DONE); 1503 1504 bind(SHORT_STRING); 1505 // Is the minimum length zero? 1506 cbz(cnt2, DONE); 1507 // arrange code to do most branches while loading and loading next characters 1508 // while comparing previous 1509 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1510 subs(cnt2, cnt2, 1); 1511 br(EQ, SHORT_LAST_INIT); 1512 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1513 b(SHORT_LOOP_START); 1514 bind(SHORT_LOOP); 1515 subs(cnt2, cnt2, 1); 1516 br(EQ, SHORT_LAST); 1517 bind(SHORT_LOOP_START); 1518 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1519 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1520 cmp(tmp1, cnt1); 1521 br(NE, SHORT_LOOP_TAIL); 1522 subs(cnt2, cnt2, 1); 1523 br(EQ, SHORT_LAST2); 1524 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1525 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1526 cmp(tmp2, rscratch1); 1527 br(EQ, SHORT_LOOP); 1528 sub(result, tmp2, rscratch1); 1529 b(DONE); 1530 bind(SHORT_LOOP_TAIL); 1531 sub(result, tmp1, cnt1); 1532 b(DONE); 1533 bind(SHORT_LAST2); 1534 cmp(tmp2, rscratch1); 1535 br(EQ, DONE); 1536 sub(result, tmp2, rscratch1); 1537 1538 b(DONE); 1539 bind(SHORT_LAST_INIT); 1540 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1541 bind(SHORT_LAST); 1542 cmp(tmp1, cnt1); 1543 br(EQ, DONE); 1544 sub(result, tmp1, cnt1); 1545 1546 bind(DONE); 1547 1548 BLOCK_COMMENT("} string_compare"); 1549 } 1550 1551 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1552 FloatRegister src2, Condition cond, bool isQ) { 1553 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1554 FloatRegister zn = src1, zm = src2; 1555 bool needs_negation = false; 1556 switch (cond) { 1557 case LT: cond = GT; zn = src2; zm = src1; break; 1558 case LE: cond = GE; zn = src2; zm = src1; break; 1559 case LO: cond = HI; zn = src2; zm = src1; break; 1560 case LS: cond = HS; zn = src2; zm = src1; break; 1561 case NE: cond = EQ; needs_negation = true; break; 1562 default: 1563 break; 1564 } 1565 1566 if (is_floating_point_type(bt)) { 1567 fcm(cond, dst, size, zn, zm); 1568 } else { 1569 cm(cond, dst, size, zn, zm); 1570 } 1571 1572 if (needs_negation) { 1573 notr(dst, isQ ? 
T16B : T8B, dst); 1574 } 1575 } 1576 1577 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1578 Condition cond, bool isQ) { 1579 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1580 if (bt == T_FLOAT || bt == T_DOUBLE) { 1581 if (cond == Assembler::NE) { 1582 fcm(Assembler::EQ, dst, size, src); 1583 notr(dst, isQ ? T16B : T8B, dst); 1584 } else { 1585 fcm(cond, dst, size, src); 1586 } 1587 } else { 1588 if (cond == Assembler::NE) { 1589 cm(Assembler::EQ, dst, size, src); 1590 notr(dst, isQ ? T16B : T8B, dst); 1591 } else { 1592 cm(cond, dst, size, src); 1593 } 1594 } 1595 } 1596 1597 // Compress the least significant bit of each byte to the rightmost and clear 1598 // the higher garbage bits. 1599 void C2_MacroAssembler::bytemask_compress(Register dst) { 1600 // Example input, dst = 0x01 00 00 00 01 01 00 01 1601 // The "??" bytes are garbage. 1602 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1603 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1604 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1605 andr(dst, dst, 0xff); // dst = 0x8D 1606 } 1607 1608 // Pack the lowest-numbered bit of each mask element in src into a long value 1609 // in dst, at most the first 64 lane elements. 1610 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1611 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1612 FloatRegister vtmp1, FloatRegister vtmp2) { 1613 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1614 assert_different_registers(dst, rscratch1); 1615 assert_different_registers(vtmp1, vtmp2); 1616 1617 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1618 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1619 // Expected: dst = 0x658D 1620 1621 // Convert the mask into vector with sequential bytes. 1622 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1623 sve_cpy(vtmp1, size, src, 1, false); 1624 if (bt != T_BYTE) { 1625 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1626 } 1627 1628 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1629 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1630 // is to compress each significant bit of the byte in a cross-lane way. Due 1631 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1632 // (bit-compress in each lane) with the biggest lane size (T = D) then 1633 // concatenate the results. 1634 1635 // The second source input of BEXT, initialized with 0x01 in each byte. 1636 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1637 sve_dup(vtmp2, B, 1); 1638 1639 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1640 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1641 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1642 // --------------------------------------- 1643 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1644 sve_bext(vtmp1, D, vtmp1, vtmp2); 1645 1646 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1647 // result to dst. 1648 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1649 // dst = 0x658D 1650 if (lane_cnt <= 8) { 1651 // No need to concatenate. 
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register in different types,
// with a valid src(0x658D) on a 1024-bit vector size machine.
//   BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
//   SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
//   INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
//   LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01101001 10001101

  // Put the long value from the general purpose register into the first lane of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum granularity of one byte, we need
  // to transform the bit mask held in the first lane into a byte mask, which can
  // be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing. As only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
1722   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1723   sve_dup(vtmp2, B, 1);
1724
1725   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1726   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1727   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1728   // ---------------------------------------
1729   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1730   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1731
1732   if (bt != T_BYTE) {
1733     sve_vector_extend(vtmp1, size, vtmp1, B);
1734   }
1735   // Generate the mask according to the given vector, in which the elements have been
1736   // extended to the expected type.
1737   // dst = 0b01100101 10001101
1738   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1739 }
1740
1741 // Clobbers: rflags
1742 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1743                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1744   assert(pg->is_governing(), "This register has to be a governing predicate register");
1745   FloatRegister z1 = zn, z2 = zm;
1746   switch (cond) {
1747     case LE: z1 = zm; z2 = zn; cond = GE; break;
1748     case LT: z1 = zm; z2 = zn; cond = GT; break;
1749     case LO: z1 = zm; z2 = zn; cond = HI; break;
1750     case LS: z1 = zm; z2 = zn; cond = HS; break;
1751     default:
1752       break;
1753   }
1754
1755   SIMD_RegVariant size = elemType_to_regVariant(bt);
1756   if (is_floating_point_type(bt)) {
1757     sve_fcm(cond, pd, size, pg, z1, z2);
1758   } else {
1759     assert(is_integral_type(bt), "unsupported element type");
1760     sve_cmp(cond, pd, size, pg, z1, z2);
1761   }
1762 }
1763
1764 // Get index of the last mask lane that is set
1765 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1766   SIMD_RegVariant size = elemType_to_regVariant(bt);
1767   sve_rev(ptmp, size, src);
1768   sve_brkb(ptmp, ptrue, ptmp, false);
1769   sve_cntp(dst, size, ptrue, ptmp);
1770   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1771   subw(dst, rscratch1, dst);
1772 }
1773
1774 // Extend integer vector src to dst with the same lane count
1775 // but larger element size, e.g. 4B -> 4I
1776 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1777                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1778   if (src_bt == T_BYTE) {
1779     if (dst_bt == T_SHORT) {
1780       // 4B/8B to 4S/8S
1781       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1782     } else {
1783       // 4B to 4I
1784       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1785       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1786       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1787     }
1788   } else if (src_bt == T_SHORT) {
1789     // 4S to 4I
1790     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1791     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1792   } else if (src_bt == T_INT) {
1793     // 2I to 2L
1794     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1795     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1796   } else {
1797     ShouldNotReachHere();
1798   }
1799 }
1800
1801 // Narrow integer vector src down to dst with the same lane count
1802 // but smaller element size, e.g.
4I -> 4B 1803 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1804 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1805 if (src_bt == T_SHORT) { 1806 // 4S/8S to 4B/8B 1807 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1808 assert(dst_bt == T_BYTE, "unsupported"); 1809 xtn(dst, T8B, src, T8H); 1810 } else if (src_bt == T_INT) { 1811 // 4I to 4B/4S 1812 assert(src_vlen_in_bytes == 16, "unsupported"); 1813 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1814 xtn(dst, T4H, src, T4S); 1815 if (dst_bt == T_BYTE) { 1816 xtn(dst, T8B, dst, T8H); 1817 } 1818 } else if (src_bt == T_LONG) { 1819 // 2L to 2I 1820 assert(src_vlen_in_bytes == 16, "unsupported"); 1821 assert(dst_bt == T_INT, "unsupported"); 1822 xtn(dst, T2S, src, T2D); 1823 } else { 1824 ShouldNotReachHere(); 1825 } 1826 } 1827 1828 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1829 FloatRegister src, SIMD_RegVariant src_size, 1830 bool is_unsigned) { 1831 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1832 1833 if (src_size == B) { 1834 switch (dst_size) { 1835 case H: 1836 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1837 break; 1838 case S: 1839 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1840 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1841 break; 1842 case D: 1843 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1844 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1845 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1846 break; 1847 default: 1848 ShouldNotReachHere(); 1849 } 1850 } else if (src_size == H) { 1851 if (dst_size == S) { 1852 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1853 } else { // D 1854 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1855 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1856 } 1857 } else if (src_size == S) { 1858 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1859 } 1860 } 1861 1862 // Vector narrow from src to dst with specified element sizes. 1863 // High part of dst vector will be filled with zero. 1864 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1865 FloatRegister src, SIMD_RegVariant src_size, 1866 FloatRegister tmp) { 1867 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1868 assert_different_registers(src, tmp); 1869 sve_dup(tmp, src_size, 0); 1870 if (src_size == D) { 1871 switch (dst_size) { 1872 case S: 1873 sve_uzp1(dst, S, src, tmp); 1874 break; 1875 case H: 1876 assert_different_registers(dst, tmp); 1877 sve_uzp1(dst, S, src, tmp); 1878 sve_uzp1(dst, H, dst, tmp); 1879 break; 1880 case B: 1881 assert_different_registers(dst, tmp); 1882 sve_uzp1(dst, S, src, tmp); 1883 sve_uzp1(dst, H, dst, tmp); 1884 sve_uzp1(dst, B, dst, tmp); 1885 break; 1886 default: 1887 ShouldNotReachHere(); 1888 } 1889 } else if (src_size == S) { 1890 if (dst_size == H) { 1891 sve_uzp1(dst, H, src, tmp); 1892 } else { // B 1893 assert_different_registers(dst, tmp); 1894 sve_uzp1(dst, H, src, tmp); 1895 sve_uzp1(dst, B, dst, tmp); 1896 } 1897 } else if (src_size == H) { 1898 sve_uzp1(dst, B, src, tmp); 1899 } 1900 } 1901 1902 // Extend src predicate to dst predicate with the same lane count but larger 1903 // element size, e.g. 
64Byte -> 512Long 1904 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1905 uint dst_element_length_in_bytes, 1906 uint src_element_length_in_bytes) { 1907 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1908 sve_punpklo(dst, src); 1909 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1910 sve_punpklo(dst, src); 1911 sve_punpklo(dst, dst); 1912 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1913 sve_punpklo(dst, src); 1914 sve_punpklo(dst, dst); 1915 sve_punpklo(dst, dst); 1916 } else { 1917 assert(false, "unsupported"); 1918 ShouldNotReachHere(); 1919 } 1920 } 1921 1922 // Narrow src predicate to dst predicate with the same lane count but 1923 // smaller element size, e.g. 512Long -> 64Byte 1924 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1925 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1926 // The insignificant bits in src predicate are expected to be zero. 1927 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1928 // passed as the second argument. An example narrowing operation with a given mask would be - 1929 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1930 // Mask (for 2 Longs) : TF 1931 // Predicate register for the above mask (16 bits) : 00000001 00000000 1932 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1933 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1934 assert_different_registers(src, ptmp); 1935 assert_different_registers(dst, ptmp); 1936 sve_pfalse(ptmp); 1937 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1938 sve_uzp1(dst, B, src, ptmp); 1939 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1940 sve_uzp1(dst, H, src, ptmp); 1941 sve_uzp1(dst, B, dst, ptmp); 1942 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1943 sve_uzp1(dst, S, src, ptmp); 1944 sve_uzp1(dst, H, dst, ptmp); 1945 sve_uzp1(dst, B, dst, ptmp); 1946 } else { 1947 assert(false, "unsupported"); 1948 ShouldNotReachHere(); 1949 } 1950 } 1951 1952 // Vector reduction add for integral type with ASIMD instructions. 1953 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1954 Register isrc, FloatRegister vsrc, 1955 unsigned vector_length_in_bytes, 1956 FloatRegister vtmp) { 1957 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1958 assert_different_registers(dst, isrc); 1959 bool isQ = vector_length_in_bytes == 16; 1960 1961 BLOCK_COMMENT("neon_reduce_add_integral {"); 1962 switch(bt) { 1963 case T_BYTE: 1964 addv(vtmp, isQ ? T16B : T8B, vsrc); 1965 smov(dst, vtmp, B, 0); 1966 addw(dst, dst, isrc, ext::sxtb); 1967 break; 1968 case T_SHORT: 1969 addv(vtmp, isQ ? T8H : T4H, vsrc); 1970 smov(dst, vtmp, H, 0); 1971 addw(dst, dst, isrc, ext::sxth); 1972 break; 1973 case T_INT: 1974 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1975 umov(dst, vtmp, S, 0); 1976 addw(dst, dst, isrc); 1977 break; 1978 case T_LONG: 1979 assert(isQ, "unsupported"); 1980 addpd(vtmp, vsrc); 1981 umov(dst, vtmp, D, 0); 1982 add(dst, dst, isrc); 1983 break; 1984 default: 1985 assert(false, "unsupported"); 1986 ShouldNotReachHere(); 1987 } 1988 BLOCK_COMMENT("} neon_reduce_add_integral"); 1989 } 1990 1991 // Vector reduction multiply for integral type with ASIMD instructions. 
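// The vector is reduced by repeatedly multiplying its lower and upper halves until only a few
// lanes remain; those lanes are then multiplied with isrc using scalar instructions, and the
// result is sign-extended for sub-word types.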
1992 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1993 // Clobbers: rscratch1 1994 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1995 Register isrc, FloatRegister vsrc, 1996 unsigned vector_length_in_bytes, 1997 FloatRegister vtmp1, FloatRegister vtmp2) { 1998 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1999 bool isQ = vector_length_in_bytes == 16; 2000 2001 BLOCK_COMMENT("neon_reduce_mul_integral {"); 2002 switch(bt) { 2003 case T_BYTE: 2004 if (isQ) { 2005 // Multiply the lower half and higher half of vector iteratively. 2006 // vtmp1 = vsrc[8:15] 2007 ins(vtmp1, D, vsrc, 0, 1); 2008 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 2009 mulv(vtmp1, T8B, vtmp1, vsrc); 2010 // vtmp2 = vtmp1[4:7] 2011 ins(vtmp2, S, vtmp1, 0, 1); 2012 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 2013 mulv(vtmp1, T8B, vtmp2, vtmp1); 2014 } else { 2015 ins(vtmp1, S, vsrc, 0, 1); 2016 mulv(vtmp1, T8B, vtmp1, vsrc); 2017 } 2018 // vtmp2 = vtmp1[2:3] 2019 ins(vtmp2, H, vtmp1, 0, 1); 2020 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 2021 mulv(vtmp2, T8B, vtmp2, vtmp1); 2022 // dst = vtmp2[0] * isrc * vtmp2[1] 2023 umov(rscratch1, vtmp2, B, 0); 2024 mulw(dst, rscratch1, isrc); 2025 sxtb(dst, dst); 2026 umov(rscratch1, vtmp2, B, 1); 2027 mulw(dst, rscratch1, dst); 2028 sxtb(dst, dst); 2029 break; 2030 case T_SHORT: 2031 if (isQ) { 2032 ins(vtmp2, D, vsrc, 0, 1); 2033 mulv(vtmp2, T4H, vtmp2, vsrc); 2034 ins(vtmp1, S, vtmp2, 0, 1); 2035 mulv(vtmp1, T4H, vtmp1, vtmp2); 2036 } else { 2037 ins(vtmp1, S, vsrc, 0, 1); 2038 mulv(vtmp1, T4H, vtmp1, vsrc); 2039 } 2040 umov(rscratch1, vtmp1, H, 0); 2041 mulw(dst, rscratch1, isrc); 2042 sxth(dst, dst); 2043 umov(rscratch1, vtmp1, H, 1); 2044 mulw(dst, rscratch1, dst); 2045 sxth(dst, dst); 2046 break; 2047 case T_INT: 2048 if (isQ) { 2049 ins(vtmp1, D, vsrc, 0, 1); 2050 mulv(vtmp1, T2S, vtmp1, vsrc); 2051 } else { 2052 vtmp1 = vsrc; 2053 } 2054 umov(rscratch1, vtmp1, S, 0); 2055 mul(dst, rscratch1, isrc); 2056 umov(rscratch1, vtmp1, S, 1); 2057 mul(dst, rscratch1, dst); 2058 break; 2059 case T_LONG: 2060 umov(rscratch1, vsrc, D, 0); 2061 mul(dst, isrc, rscratch1); 2062 umov(rscratch1, vsrc, D, 1); 2063 mul(dst, dst, rscratch1); 2064 break; 2065 default: 2066 assert(false, "unsupported"); 2067 ShouldNotReachHere(); 2068 } 2069 BLOCK_COMMENT("} neon_reduce_mul_integral"); 2070 } 2071 2072 // Vector reduction multiply for floating-point type with ASIMD instructions. 
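// The lanes are combined strictly in order (fsrc, then lane 0, 1, ...) rather than pairwise,
// since floating-point multiplication is not associative.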
2073 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 2074 FloatRegister fsrc, FloatRegister vsrc, 2075 unsigned vector_length_in_bytes, 2076 FloatRegister vtmp) { 2077 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2078 bool isQ = vector_length_in_bytes == 16; 2079 2080 BLOCK_COMMENT("neon_reduce_mul_fp {"); 2081 switch(bt) { 2082 case T_FLOAT: 2083 fmuls(dst, fsrc, vsrc); 2084 ins(vtmp, S, vsrc, 0, 1); 2085 fmuls(dst, dst, vtmp); 2086 if (isQ) { 2087 ins(vtmp, S, vsrc, 0, 2); 2088 fmuls(dst, dst, vtmp); 2089 ins(vtmp, S, vsrc, 0, 3); 2090 fmuls(dst, dst, vtmp); 2091 } 2092 break; 2093 case T_DOUBLE: 2094 assert(isQ, "unsupported"); 2095 fmuld(dst, fsrc, vsrc); 2096 ins(vtmp, D, vsrc, 0, 1); 2097 fmuld(dst, dst, vtmp); 2098 break; 2099 default: 2100 assert(false, "unsupported"); 2101 ShouldNotReachHere(); 2102 } 2103 BLOCK_COMMENT("} neon_reduce_mul_fp"); 2104 } 2105 2106 // Helper to select logical instruction 2107 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 2108 Register Rn, Register Rm, 2109 enum shift_kind kind, unsigned shift) { 2110 switch(opc) { 2111 case Op_AndReductionV: 2112 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 2113 break; 2114 case Op_OrReductionV: 2115 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 2116 break; 2117 case Op_XorReductionV: 2118 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 2119 break; 2120 default: 2121 assert(false, "unsupported"); 2122 ShouldNotReachHere(); 2123 } 2124 } 2125 2126 // Vector reduction logical operations And, Or, Xor 2127 // Clobbers: rscratch1 2128 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2129 Register isrc, FloatRegister vsrc, 2130 unsigned vector_length_in_bytes) { 2131 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2132 "unsupported"); 2133 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2134 assert_different_registers(dst, isrc); 2135 bool isQ = vector_length_in_bytes == 16; 2136 2137 BLOCK_COMMENT("neon_reduce_logical {"); 2138 umov(rscratch1, vsrc, isQ ? D : S, 0); 2139 umov(dst, vsrc, isQ ? 
D : S, 1); 2140 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2141 switch(bt) { 2142 case T_BYTE: 2143 if (isQ) { 2144 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2145 } 2146 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2147 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2148 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2149 sxtb(dst, dst); 2150 break; 2151 case T_SHORT: 2152 if (isQ) { 2153 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2154 } 2155 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2156 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2157 sxth(dst, dst); 2158 break; 2159 case T_INT: 2160 if (isQ) { 2161 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2162 } 2163 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2164 break; 2165 case T_LONG: 2166 assert(isQ, "unsupported"); 2167 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2168 break; 2169 default: 2170 assert(false, "unsupported"); 2171 ShouldNotReachHere(); 2172 } 2173 BLOCK_COMMENT("} neon_reduce_logical"); 2174 } 2175 2176 // Vector reduction min/max for integral type with ASIMD instructions. 2177 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2178 // Clobbers: rscratch1, rflags 2179 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2180 Register isrc, FloatRegister vsrc, 2181 unsigned vector_length_in_bytes, 2182 FloatRegister vtmp) { 2183 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2184 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2185 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2186 assert_different_registers(dst, isrc); 2187 bool isQ = vector_length_in_bytes == 16; 2188 bool is_min = opc == Op_MinReductionV; 2189 2190 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2191 if (bt == T_LONG) { 2192 assert(vtmp == fnoreg, "should be"); 2193 assert(isQ, "should be"); 2194 umov(rscratch1, vsrc, D, 0); 2195 cmp(isrc, rscratch1); 2196 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2197 umov(rscratch1, vsrc, D, 1); 2198 cmp(dst, rscratch1); 2199 csel(dst, dst, rscratch1, is_min ? LT : GT); 2200 } else { 2201 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2202 if (size == T2S) { 2203 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2204 } else { 2205 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2206 } 2207 if (bt == T_INT) { 2208 umov(dst, vtmp, S, 0); 2209 } else { 2210 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2211 } 2212 cmpw(dst, isrc); 2213 cselw(dst, dst, isrc, is_min ? LT : GT); 2214 } 2215 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2216 } 2217 2218 // Vector reduction for integral type with SVE instruction. 2219 // Supported operations are Add, And, Or, Xor, Max, Min. 2220 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
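// A single predicated SVE reduction (e.g. sve_uaddv, sve_andv) folds the active lanes of src2
// into lane 0 of tmp; the scalar result is then extracted and combined with src1.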
2221 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2222 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2223 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2224 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2225 assert_different_registers(src1, dst); 2226 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2227 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2228 switch (opc) { 2229 case Op_AddReductionVI: { 2230 sve_uaddv(tmp, size, pg, src2); 2231 if (bt == T_BYTE) { 2232 smov(dst, tmp, size, 0); 2233 addw(dst, src1, dst, ext::sxtb); 2234 } else if (bt == T_SHORT) { 2235 smov(dst, tmp, size, 0); 2236 addw(dst, src1, dst, ext::sxth); 2237 } else { 2238 umov(dst, tmp, size, 0); 2239 addw(dst, dst, src1); 2240 } 2241 break; 2242 } 2243 case Op_AddReductionVL: { 2244 sve_uaddv(tmp, size, pg, src2); 2245 umov(dst, tmp, size, 0); 2246 add(dst, dst, src1); 2247 break; 2248 } 2249 case Op_AndReductionV: { 2250 sve_andv(tmp, size, pg, src2); 2251 if (bt == T_INT || bt == T_LONG) { 2252 umov(dst, tmp, size, 0); 2253 } else { 2254 smov(dst, tmp, size, 0); 2255 } 2256 if (bt == T_LONG) { 2257 andr(dst, dst, src1); 2258 } else { 2259 andw(dst, dst, src1); 2260 } 2261 break; 2262 } 2263 case Op_OrReductionV: { 2264 sve_orv(tmp, size, pg, src2); 2265 if (bt == T_INT || bt == T_LONG) { 2266 umov(dst, tmp, size, 0); 2267 } else { 2268 smov(dst, tmp, size, 0); 2269 } 2270 if (bt == T_LONG) { 2271 orr(dst, dst, src1); 2272 } else { 2273 orrw(dst, dst, src1); 2274 } 2275 break; 2276 } 2277 case Op_XorReductionV: { 2278 sve_eorv(tmp, size, pg, src2); 2279 if (bt == T_INT || bt == T_LONG) { 2280 umov(dst, tmp, size, 0); 2281 } else { 2282 smov(dst, tmp, size, 0); 2283 } 2284 if (bt == T_LONG) { 2285 eor(dst, dst, src1); 2286 } else { 2287 eorw(dst, dst, src1); 2288 } 2289 break; 2290 } 2291 case Op_MaxReductionV: { 2292 sve_smaxv(tmp, size, pg, src2); 2293 if (bt == T_INT || bt == T_LONG) { 2294 umov(dst, tmp, size, 0); 2295 } else { 2296 smov(dst, tmp, size, 0); 2297 } 2298 if (bt == T_LONG) { 2299 cmp(dst, src1); 2300 csel(dst, dst, src1, Assembler::GT); 2301 } else { 2302 cmpw(dst, src1); 2303 cselw(dst, dst, src1, Assembler::GT); 2304 } 2305 break; 2306 } 2307 case Op_MinReductionV: { 2308 sve_sminv(tmp, size, pg, src2); 2309 if (bt == T_INT || bt == T_LONG) { 2310 umov(dst, tmp, size, 0); 2311 } else { 2312 smov(dst, tmp, size, 0); 2313 } 2314 if (bt == T_LONG) { 2315 cmp(dst, src1); 2316 csel(dst, dst, src1, Assembler::LT); 2317 } else { 2318 cmpw(dst, src1); 2319 cselw(dst, dst, src1, Assembler::LT); 2320 } 2321 break; 2322 } 2323 default: 2324 assert(false, "unsupported"); 2325 ShouldNotReachHere(); 2326 } 2327 2328 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2329 if (bt == T_BYTE) { 2330 sxtb(dst, dst); 2331 } else if (bt == T_SHORT) { 2332 sxth(dst, dst); 2333 } 2334 } 2335 } 2336 2337 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2338 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2339 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
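// An immediate "ptrue" pattern (VL1..VL256, POW2, MUL3, MUL4, ALL) is used whenever the lane
// count allows; otherwise the predicate is generated with "whileltw".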
2340 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2341 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2342 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2343 2344 // Set all elements to false if the input "lane_cnt" is zero. 2345 if (lane_cnt == 0) { 2346 sve_pfalse(dst); 2347 return; 2348 } 2349 2350 SIMD_RegVariant size = elemType_to_regVariant(bt); 2351 assert(size != Q, "invalid size"); 2352 2353 // Set all true if "lane_cnt" equals to the max lane count. 2354 if (lane_cnt == max_vector_length) { 2355 sve_ptrue(dst, size, /* ALL */ 0b11111); 2356 return; 2357 } 2358 2359 // Fixed numbers for "ptrue". 2360 switch(lane_cnt) { 2361 case 1: /* VL1 */ 2362 case 2: /* VL2 */ 2363 case 3: /* VL3 */ 2364 case 4: /* VL4 */ 2365 case 5: /* VL5 */ 2366 case 6: /* VL6 */ 2367 case 7: /* VL7 */ 2368 case 8: /* VL8 */ 2369 sve_ptrue(dst, size, lane_cnt); 2370 return; 2371 case 16: 2372 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2373 return; 2374 case 32: 2375 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2376 return; 2377 case 64: 2378 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2379 return; 2380 case 128: 2381 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2382 return; 2383 case 256: 2384 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2385 return; 2386 default: 2387 break; 2388 } 2389 2390 // Special patterns for "ptrue". 2391 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2392 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2393 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2394 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2395 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2396 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2397 } else { 2398 // Encode to "whileltw" for the remaining cases. 2399 mov(rscratch1, lane_cnt); 2400 sve_whileltw(dst, size, zr, rscratch1); 2401 } 2402 } 2403 2404 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2405 // Any remaining elements of dst will be filled with zero. 2406 // Clobbers: rscratch1 2407 // Preserves: src, mask 2408 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2409 FloatRegister vtmp1, FloatRegister vtmp2, 2410 PRegister pgtmp) { 2411 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2412 assert_different_registers(dst, src, vtmp1, vtmp2); 2413 assert_different_registers(mask, pgtmp); 2414 2415 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 2416 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 2417 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 2418 sve_dup(vtmp2, H, 0); 2419 2420 // Extend lowest half to type INT. 2421 // dst = 00004444 00003333 00002222 00001111 2422 sve_uunpklo(dst, S, src); 2423 // pgtmp = 00000001 00000000 00000001 00000001 2424 sve_punpklo(pgtmp, mask); 2425 // Pack the active elements in size of type INT to the right, 2426 // and fill the remainings with zero. 2427 // dst = 00000000 00004444 00002222 00001111 2428 sve_compact(dst, S, dst, pgtmp); 2429 // Narrow the result back to type SHORT. 2430 // dst = 0000 0000 0000 0000 0000 4444 2222 1111 2431 sve_uzp1(dst, H, dst, vtmp2); 2432 // Count the active elements of lowest half. 2433 // rscratch1 = 3 2434 sve_cntp(rscratch1, S, ptrue, pgtmp); 2435 2436 // Repeat to the highest half. 
2437 // pgtmp = 00000001 00000000 00000000 00000001 2438 sve_punpkhi(pgtmp, mask); 2439 // vtmp1 = 00008888 00007777 00006666 00005555 2440 sve_uunpkhi(vtmp1, S, src); 2441 // vtmp1 = 00000000 00000000 00008888 00005555 2442 sve_compact(vtmp1, S, vtmp1, pgtmp); 2443 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2444 sve_uzp1(vtmp1, H, vtmp1, vtmp2); 2445 2446 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111 2447 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2448 // Left shift(cross lane) compressed high with TRUE_CNT lanes, 2449 // TRUE_CNT is the number of active elements in the compressed low. 2450 neg(rscratch1, rscratch1); 2451 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2452 sve_index(vtmp2, H, rscratch1, 1); 2453 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000 2454 sve_tbl(vtmp1, H, vtmp1, vtmp2); 2455 2456 // Combine the compressed high(after shifted) with the compressed low. 2457 // dst = 0000 0000 0000 8888 5555 4444 2222 1111 2458 sve_orr(dst, dst, vtmp1); 2459 } 2460 2461 // Clobbers: rscratch1, rscratch2 2462 // Preserves: src, mask 2463 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, 2464 FloatRegister vtmp1, FloatRegister vtmp2, 2465 FloatRegister vtmp3, FloatRegister vtmp4, 2466 PRegister ptmp, PRegister pgtmp) { 2467 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2468 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4); 2469 assert_different_registers(mask, ptmp, pgtmp); 2470 // Example input: src = 88 77 66 55 44 33 22 11 2471 // mask = 01 00 00 01 01 00 01 01 2472 // Expected result: dst = 00 00 00 88 55 44 22 11 2473 2474 sve_dup(vtmp4, B, 0); 2475 // Extend lowest half to type SHORT. 2476 // vtmp1 = 0044 0033 0022 0011 2477 sve_uunpklo(vtmp1, H, src); 2478 // ptmp = 0001 0000 0001 0001 2479 sve_punpklo(ptmp, mask); 2480 // Count the active elements of lowest half. 2481 // rscratch2 = 3 2482 sve_cntp(rscratch2, H, ptrue, ptmp); 2483 // Pack the active elements in size of type SHORT to the right, 2484 // and fill the remainings with zero. 2485 // dst = 0000 0044 0022 0011 2486 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp); 2487 // Narrow the result back to type BYTE. 2488 // dst = 00 00 00 00 00 44 22 11 2489 sve_uzp1(dst, B, dst, vtmp4); 2490 2491 // Repeat to the highest half. 2492 // ptmp = 0001 0000 0000 0001 2493 sve_punpkhi(ptmp, mask); 2494 // vtmp1 = 0088 0077 0066 0055 2495 sve_uunpkhi(vtmp2, H, src); 2496 // vtmp1 = 0000 0000 0088 0055 2497 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp); 2498 2499 sve_dup(vtmp4, B, 0); 2500 // vtmp1 = 00 00 00 00 00 00 88 55 2501 sve_uzp1(vtmp1, B, vtmp1, vtmp4); 2502 2503 // Compressed low: dst = 00 00 00 00 00 44 22 11 2504 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55 2505 // Left shift(cross lane) compressed high with TRUE_CNT lanes, 2506 // TRUE_CNT is the number of active elements in the compressed low. 2507 neg(rscratch2, rscratch2); 2508 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2509 sve_index(vtmp2, B, rscratch2, 1); 2510 // vtmp1 = 00 00 00 88 55 00 00 00 2511 sve_tbl(vtmp1, B, vtmp1, vtmp2); 2512 // Combine the compressed high(after shifted) with the compressed low. 
2513 // dst = 00 00 00 88 55 44 22 11 2514 sve_orr(dst, dst, vtmp1); 2515 } 2516 2517 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2518 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2519 SIMD_Arrangement size = isQ ? T16B : T8B; 2520 if (bt == T_BYTE) { 2521 rbit(dst, size, src); 2522 } else { 2523 neon_reverse_bytes(dst, src, bt, isQ); 2524 rbit(dst, size, dst); 2525 } 2526 } 2527 2528 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2529 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2530 SIMD_Arrangement size = isQ ? T16B : T8B; 2531 switch (bt) { 2532 case T_BYTE: 2533 if (dst != src) { 2534 orr(dst, size, src, src); 2535 } 2536 break; 2537 case T_SHORT: 2538 rev16(dst, size, src); 2539 break; 2540 case T_INT: 2541 rev32(dst, size, src); 2542 break; 2543 case T_LONG: 2544 rev64(dst, size, src); 2545 break; 2546 default: 2547 assert(false, "unsupported"); 2548 ShouldNotReachHere(); 2549 } 2550 } 2551 2552 // Extract a scalar element from an sve vector at position 'idx'. 2553 // The input elements in src are expected to be of integral type. 2554 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2555 int idx, FloatRegister vtmp) { 2556 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2557 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2558 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2559 if (bt == T_INT || bt == T_LONG) { 2560 umov(dst, src, size, idx); 2561 } else { 2562 smov(dst, src, size, idx); 2563 } 2564 } else { 2565 sve_orr(vtmp, src, src); 2566 sve_ext(vtmp, vtmp, idx << size); 2567 if (bt == T_INT || bt == T_LONG) { 2568 umov(dst, vtmp, size, 0); 2569 } else { 2570 smov(dst, vtmp, size, 0); 2571 } 2572 } 2573 } 2574 2575 // java.lang.Math::round intrinsics 2576 2577 // Clobbers: rscratch1, rflags 2578 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2579 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2580 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2581 switch (T) { 2582 case T2S: 2583 case T4S: 2584 fmovs(tmp1, T, 0.5f); 2585 mov(rscratch1, jint_cast(0x1.0p23f)); 2586 break; 2587 case T2D: 2588 fmovd(tmp1, T, 0.5); 2589 mov(rscratch1, julong_cast(0x1.0p52)); 2590 break; 2591 default: 2592 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2593 } 2594 fadd(tmp1, T, tmp1, src); 2595 fcvtms(tmp1, T, tmp1); 2596 // tmp1 = floor(src + 0.5, ties to even) 2597 2598 fcvtas(dst, T, src); 2599 // dst = round(src), ties to away 2600 2601 fneg(tmp3, T, src); 2602 dup(tmp2, T, rscratch1); 2603 cm(HS, tmp3, T, tmp3, tmp2); 2604 // tmp3 is now a set of flags 2605 2606 bif(dst, T16B, tmp1, tmp3); 2607 // result in dst 2608 } 2609 2610 // Clobbers: rscratch1, rflags 2611 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2612 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2613 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2614 assert_different_registers(tmp1, tmp2, src, dst); 2615 2616 switch (T) { 2617 case S: 2618 mov(rscratch1, jint_cast(0x1.0p23f)); 2619 break; 2620 case D: 2621 mov(rscratch1, julong_cast(0x1.0p52)); 2622 break; 2623 
default: 2624 assert(T == S || T == D, "invalid register variant"); 2625 } 2626 2627 sve_frinta(dst, T, ptrue, src); 2628 // dst = round(src), ties to away 2629 2630 Label none; 2631 2632 sve_fneg(tmp1, T, ptrue, src); 2633 sve_dup(tmp2, T, rscratch1); 2634 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2635 br(EQ, none); 2636 { 2637 sve_cpy(tmp1, T, pgtmp, 0.5); 2638 sve_fadd(tmp1, T, pgtmp, src); 2639 sve_frintm(dst, T, pgtmp, tmp1); 2640 // dst = floor(src + 0.5, ties to even) 2641 } 2642 bind(none); 2643 2644 sve_fcvtzs(dst, T, ptrue, dst, T); 2645 // result in dst 2646 } 2647 2648 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2649 FloatRegister one, SIMD_Arrangement T) { 2650 assert_different_registers(dst, src, zero, one); 2651 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2652 2653 facgt(dst, T, src, zero); 2654 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise 2655 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst 2656 } 2657 2658 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero, 2659 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) { 2660 assert_different_registers(dst, src, zero, one, vtmp); 2661 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2662 2663 sve_orr(vtmp, src, src); 2664 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise 2665 switch (T) { 2666 case S: 2667 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src 2668 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending 2669 // on the sign of the float value 2670 break; 2671 case D: 2672 sve_and(vtmp, T, min_jlong); 2673 sve_orr(vtmp, T, jlong_cast(1.0)); 2674 break; 2675 default: 2676 assert(false, "unsupported"); 2677 ShouldNotReachHere(); 2678 } 2679 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp 2680 // Result in dst 2681 } 2682 2683 bool C2_MacroAssembler::in_scratch_emit_size() { 2684 if (ciEnv::current()->task() != nullptr) { 2685 PhaseOutput* phase_output = Compile::current()->output(); 2686 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2687 return true; 2688 } 2689 } 2690 return MacroAssembler::in_scratch_emit_size(); 2691 }