/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition is true we will succeed at cont and hence can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty.
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, tmpReg);
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont); // If so we are done.

  // Check if there is a successor.
  ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
  cmp(rscratch1, zr);
  br(Assembler::NE, unlocked); // If so we are done.

  // Save the monitor pointer in the current thread, so we can try to
  // reacquire the lock in SharedRuntime::monitor_exit_helper().
  str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

  cmp(zr, rthread); // Set Flag to NE => slow path
  b(cont);

  bind(unlocked);
  cmp(zr, zr); // Set Flag to EQ => fast path

  // Intentional fall-through

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked); // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few Java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = y[i+j];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c < 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
      // convert Latin1 to UTF.
We'll have to wait until load completed, but 752 // it's still faster than per-character loads+checks 753 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 754 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 755 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 756 andr(tmp6, tmp6, 0xFF); // str1[N-4] 757 orr(ch2, ch1, ch2, LSL, 16); 758 orr(tmp6, tmp6, tmp3, LSL, 48); 759 orr(tmp6, tmp6, ch2, LSL, 16); 760 } 761 BIND(BMLOOPSTR2); 762 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 763 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 764 if (str1_isL == str2_isL) { 765 // re-init tmp3. It's for free because it's executed in parallel with 766 // load above. Alternative is to initialize it before loop, but it'll 767 // affect performance on in-order systems with 2 or more ld/st pipelines 768 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 769 } 770 if (!isL) { // UU/UL case 771 lsl(ch2, cnt1tmp, 1); // offset in bytes 772 } 773 cmp(tmp3, skipch); 774 br(NE, BMSKIP); 775 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 776 mov(ch1, tmp6); 777 if (isL) { 778 b(BMLOOPSTR1_AFTER_LOAD); 779 } else { 780 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 781 b(BMLOOPSTR1_CMP); 782 } 783 BIND(BMLOOPSTR1); 784 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 785 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 786 BIND(BMLOOPSTR1_AFTER_LOAD); 787 subs(cnt1tmp, cnt1tmp, 1); 788 br(LT, BMLOOPSTR1_LASTCMP); 789 BIND(BMLOOPSTR1_CMP); 790 cmp(ch1, ch2); 791 br(EQ, BMLOOPSTR1); 792 BIND(BMSKIP); 793 if (!isL) { 794 // if we've met UTF symbol while searching Latin1 pattern, then we can 795 // skip cnt1 symbols 796 if (str1_isL != str2_isL) { 797 mov(result_tmp, cnt1); 798 } else { 799 mov(result_tmp, 1); 800 } 801 subs(zr, skipch, ASIZE); 802 br(HS, BMADV); 803 } 804 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 805 BIND(BMADV); 806 sub(cnt1tmp, cnt1, 1); 807 add(str2, str2, result_tmp, LSL, str2_chr_shift); 808 cmp(str2, str2end); 809 br(LE, BMLOOPSTR2); 810 add(sp, sp, ASIZE); 811 b(NOMATCH); 812 BIND(BMLOOPSTR1_LASTCMP); 813 cmp(ch1, ch2); 814 br(NE, BMSKIP); 815 BIND(BMMATCH); 816 sub(result, str2, tmp5); 817 if (!str2_isL) lsr(result, result, 1); 818 add(sp, sp, ASIZE); 819 b(DONE); 820 821 BIND(LINEARSTUB); 822 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 823 br(LT, LINEAR_MEDIUM); 824 mov(result, zr); 825 RuntimeAddress stub = nullptr; 826 if (isL) { 827 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 828 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 829 } else if (str1_isL) { 830 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 831 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 832 } else { 833 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 834 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 835 } 836 address call = trampoline_call(stub); 837 if (call == nullptr) { 838 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 839 ciEnv::current()->record_failure("CodeCache is full"); 840 return; 841 } 842 b(DONE); 843 } 844 845 BIND(LINEARSEARCH); 846 { 847 Label DO1, DO2, DO3; 848 849 Register str2tmp = tmp2; 850 Register first = tmp3; 851 852 if (icnt1 == 
-1) 853 { 854 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 855 856 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 857 br(LT, DOSHORT); 858 BIND(LINEAR_MEDIUM); 859 (this->*str1_load_1chr)(first, Address(str1)); 860 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 861 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 862 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 863 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 864 865 BIND(FIRST_LOOP); 866 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 867 cmp(first, ch2); 868 br(EQ, STR1_LOOP); 869 BIND(STR2_NEXT); 870 adds(cnt2_neg, cnt2_neg, str2_chr_size); 871 br(LE, FIRST_LOOP); 872 b(NOMATCH); 873 874 BIND(STR1_LOOP); 875 adds(cnt1tmp, cnt1_neg, str1_chr_size); 876 add(cnt2tmp, cnt2_neg, str2_chr_size); 877 br(GE, MATCH); 878 879 BIND(STR1_NEXT); 880 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 881 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 882 cmp(ch1, ch2); 883 br(NE, STR2_NEXT); 884 adds(cnt1tmp, cnt1tmp, str1_chr_size); 885 add(cnt2tmp, cnt2tmp, str2_chr_size); 886 br(LT, STR1_NEXT); 887 b(MATCH); 888 889 BIND(DOSHORT); 890 if (str1_isL == str2_isL) { 891 cmp(cnt1, (u1)2); 892 br(LT, DO1); 893 br(GT, DO3); 894 } 895 } 896 897 if (icnt1 == 4) { 898 Label CH1_LOOP; 899 900 (this->*load_4chr)(ch1, str1); 901 sub(result_tmp, cnt2, 4); 902 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 903 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 904 905 BIND(CH1_LOOP); 906 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 907 cmp(ch1, ch2); 908 br(EQ, MATCH); 909 adds(cnt2_neg, cnt2_neg, str2_chr_size); 910 br(LE, CH1_LOOP); 911 b(NOMATCH); 912 } 913 914 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 915 Label CH1_LOOP; 916 917 BIND(DO2); 918 (this->*load_2chr)(ch1, str1); 919 if (icnt1 == 2) { 920 sub(result_tmp, cnt2, 2); 921 } 922 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 923 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 924 BIND(CH1_LOOP); 925 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 926 cmp(ch1, ch2); 927 br(EQ, MATCH); 928 adds(cnt2_neg, cnt2_neg, str2_chr_size); 929 br(LE, CH1_LOOP); 930 b(NOMATCH); 931 } 932 933 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 934 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 935 936 BIND(DO3); 937 (this->*load_2chr)(first, str1); 938 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 939 if (icnt1 == 3) { 940 sub(result_tmp, cnt2, 3); 941 } 942 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 943 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 944 BIND(FIRST_LOOP); 945 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 946 cmpw(first, ch2); 947 br(EQ, STR1_LOOP); 948 BIND(STR2_NEXT); 949 adds(cnt2_neg, cnt2_neg, str2_chr_size); 950 br(LE, FIRST_LOOP); 951 b(NOMATCH); 952 953 BIND(STR1_LOOP); 954 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 955 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 956 cmp(ch1, ch2); 957 br(NE, STR2_NEXT); 958 b(MATCH); 959 } 960 961 if (icnt1 == -1 || icnt1 == 1) { 962 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 963 964 BIND(DO1); 965 (this->*str1_load_1chr)(ch1, str1); 966 cmp(cnt2, (u1)8); 967 br(LT, DO1_SHORT); 968 969 sub(result_tmp, cnt2, 8/str2_chr_size); 970 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 971 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 972 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 973 974 if (str2_isL) { 975 orr(ch1, ch1, ch1, LSL, 8); 976 } 977 orr(ch1, ch1, ch1, LSL, 16); 978 orr(ch1, ch1, ch1, LSL, 32); 979 BIND(CH1_LOOP); 980 ldr(ch2, Address(str2, cnt2_neg)); 981 eor(ch2, ch1, ch2); 982 sub(tmp1, ch2, tmp3); 983 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 984 bics(tmp1, tmp1, tmp2); 985 br(NE, HAS_ZERO); 986 adds(cnt2_neg, cnt2_neg, 8); 987 br(LT, CH1_LOOP); 988 989 cmp(cnt2_neg, (u1)8); 990 mov(cnt2_neg, 0); 991 br(LT, CH1_LOOP); 992 b(NOMATCH); 993 994 BIND(HAS_ZERO); 995 rev(tmp1, tmp1); 996 clz(tmp1, tmp1); 997 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 998 b(MATCH); 999 1000 BIND(DO1_SHORT); 1001 mov(result_tmp, cnt2); 1002 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 1003 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 1004 BIND(DO1_LOOP); 1005 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 1006 cmpw(ch1, ch2); 1007 br(EQ, MATCH); 1008 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1009 br(LT, DO1_LOOP); 1010 } 1011 } 1012 BIND(NOMATCH); 1013 mov(result, -1); 1014 b(DONE); 1015 BIND(MATCH); 1016 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 1017 BIND(DONE); 1018 } 1019 1020 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 1021 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 1022 1023 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 1024 Register ch, Register result, 1025 Register tmp1, Register tmp2, Register tmp3) 1026 { 1027 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1028 Register cnt1_neg = cnt1; 1029 Register ch1 = rscratch1; 1030 Register result_tmp = rscratch2; 1031 1032 cbz(cnt1, NOMATCH); 1033 1034 cmp(cnt1, (u1)4); 1035 br(LT, DO1_SHORT); 1036 1037 orr(ch, ch, ch, LSL, 16); 1038 orr(ch, ch, ch, LSL, 32); 1039 1040 sub(cnt1, cnt1, 4); 1041 mov(result_tmp, cnt1); 1042 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1043 sub(cnt1_neg, zr, cnt1, LSL, 1); 1044 1045 mov(tmp3, 0x0001000100010001); 1046 1047 BIND(CH1_LOOP); 1048 ldr(ch1, Address(str1, cnt1_neg)); 1049 eor(ch1, ch, ch1); 1050 sub(tmp1, ch1, tmp3); 1051 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 1052 bics(tmp1, tmp1, tmp2); 1053 br(NE, HAS_ZERO); 1054 adds(cnt1_neg, cnt1_neg, 8); 1055 br(LT, CH1_LOOP); 1056 1057 cmp(cnt1_neg, (u1)8); 1058 mov(cnt1_neg, 0); 1059 br(LT, CH1_LOOP); 1060 b(NOMATCH); 1061 1062 BIND(HAS_ZERO); 1063 rev(tmp1, tmp1); 1064 clz(tmp1, tmp1); 1065 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1066 b(MATCH); 1067 1068 BIND(DO1_SHORT); 1069 mov(result_tmp, cnt1); 1070 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1071 sub(cnt1_neg, zr, cnt1, LSL, 1); 1072 BIND(DO1_LOOP); 1073 ldrh(ch1, Address(str1, cnt1_neg)); 1074 cmpw(ch, ch1); 1075 br(EQ, MATCH); 1076 adds(cnt1_neg, cnt1_neg, 2); 1077 br(LT, DO1_LOOP); 1078 BIND(NOMATCH); 1079 mov(result, -1); 1080 b(DONE); 1081 BIND(MATCH); 1082 add(result, result_tmp, cnt1_neg, ASR, 1); 1083 BIND(DONE); 1084 } 1085 1086 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 1087 Register ch, Register result, 1088 FloatRegister ztmp1, 1089 FloatRegister ztmp2, 1090 PRegister tmp_pg, 1091 PRegister tmp_pdn, bool isL) 1092 { 1093 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 
1094 assert(tmp_pg->is_governing(), 1095 "this register has to be a governing predicate register"); 1096 1097 Label LOOP, MATCH, DONE, NOMATCH; 1098 Register vec_len = rscratch1; 1099 Register idx = rscratch2; 1100 1101 SIMD_RegVariant T = (isL == true) ? B : H; 1102 1103 cbz(cnt1, NOMATCH); 1104 1105 // Assign the particular char throughout the vector. 1106 sve_dup(ztmp2, T, ch); 1107 if (isL) { 1108 sve_cntb(vec_len); 1109 } else { 1110 sve_cnth(vec_len); 1111 } 1112 mov(idx, 0); 1113 1114 // Generate a predicate to control the reading of input string. 1115 sve_whilelt(tmp_pg, T, idx, cnt1); 1116 1117 BIND(LOOP); 1118 // Read a vector of 8- or 16-bit data depending on the string type. Note 1119 // that inactive elements indicated by the predicate register won't cause 1120 // a data read from memory to the destination vector. 1121 if (isL) { 1122 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1123 } else { 1124 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1125 } 1126 add(idx, idx, vec_len); 1127 1128 // Perform the comparison. An element of the destination predicate is set 1129 // to active if the particular char is matched. 1130 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1131 1132 // Branch if the particular char is found. 1133 br(NE, MATCH); 1134 1135 sve_whilelt(tmp_pg, T, idx, cnt1); 1136 1137 // Loop back if the particular char not found. 1138 br(MI, LOOP); 1139 1140 BIND(NOMATCH); 1141 mov(result, -1); 1142 b(DONE); 1143 1144 BIND(MATCH); 1145 // Undo the index increment. 1146 sub(idx, idx, vec_len); 1147 1148 // Crop the vector to find its location. 1149 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1150 add(result, idx, -1); 1151 sve_incp(result, T, tmp_pdn); 1152 BIND(DONE); 1153 } 1154 1155 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1156 Register ch, Register result, 1157 Register tmp1, Register tmp2, Register tmp3) 1158 { 1159 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1160 Register cnt1_neg = cnt1; 1161 Register ch1 = rscratch1; 1162 Register result_tmp = rscratch2; 1163 1164 cbz(cnt1, NOMATCH); 1165 1166 cmp(cnt1, (u1)8); 1167 br(LT, DO1_SHORT); 1168 1169 orr(ch, ch, ch, LSL, 8); 1170 orr(ch, ch, ch, LSL, 16); 1171 orr(ch, ch, ch, LSL, 32); 1172 1173 sub(cnt1, cnt1, 8); 1174 mov(result_tmp, cnt1); 1175 lea(str1, Address(str1, cnt1)); 1176 sub(cnt1_neg, zr, cnt1); 1177 1178 mov(tmp3, 0x0101010101010101); 1179 1180 BIND(CH1_LOOP); 1181 ldr(ch1, Address(str1, cnt1_neg)); 1182 eor(ch1, ch, ch1); 1183 sub(tmp1, ch1, tmp3); 1184 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1185 bics(tmp1, tmp1, tmp2); 1186 br(NE, HAS_ZERO); 1187 adds(cnt1_neg, cnt1_neg, 8); 1188 br(LT, CH1_LOOP); 1189 1190 cmp(cnt1_neg, (u1)8); 1191 mov(cnt1_neg, 0); 1192 br(LT, CH1_LOOP); 1193 b(NOMATCH); 1194 1195 BIND(HAS_ZERO); 1196 rev(tmp1, tmp1); 1197 clz(tmp1, tmp1); 1198 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1199 b(MATCH); 1200 1201 BIND(DO1_SHORT); 1202 mov(result_tmp, cnt1); 1203 lea(str1, Address(str1, cnt1)); 1204 sub(cnt1_neg, zr, cnt1); 1205 BIND(DO1_LOOP); 1206 ldrb(ch1, Address(str1, cnt1_neg)); 1207 cmp(ch, ch1); 1208 br(EQ, MATCH); 1209 adds(cnt1_neg, cnt1_neg, 1); 1210 br(LT, DO1_LOOP); 1211 BIND(NOMATCH); 1212 mov(result, -1); 1213 b(DONE); 1214 BIND(MATCH); 1215 add(result, result_tmp, cnt1_neg); 1216 BIND(DONE); 1217 } 1218 1219 // Compare strings. 
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
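  // The subsw below leaves result = cnt1 - cnt2 and the cselw replaces cnt2
  // with min(cnt1, cnt2). If the compared prefixes are equal, result keeps
  // this length difference all the way to DONE, which is the value compareTo
  // expects when one string is a prefix of the other.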
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ?
-8 : -16); 1373 lsrv(tmp1, tmp1, rscratch2); 1374 (this->*ext_chr)(tmp1, tmp1); 1375 lsrv(tmp2, tmp2, rscratch2); 1376 (this->*ext_chr)(tmp2, tmp2); 1377 subw(result, tmp1, tmp2); 1378 b(DONE); 1379 } 1380 1381 bind(STUB); 1382 RuntimeAddress stub = nullptr; 1383 switch(ae) { 1384 case StrIntrinsicNode::LL: 1385 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1386 break; 1387 case StrIntrinsicNode::UU: 1388 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1389 break; 1390 case StrIntrinsicNode::LU: 1391 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1392 break; 1393 case StrIntrinsicNode::UL: 1394 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1395 break; 1396 default: 1397 ShouldNotReachHere(); 1398 } 1399 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1400 address call = trampoline_call(stub); 1401 if (call == nullptr) { 1402 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1403 ciEnv::current()->record_failure("CodeCache is full"); 1404 return; 1405 } 1406 b(DONE); 1407 1408 bind(SHORT_STRING); 1409 // Is the minimum length zero? 1410 cbz(cnt2, DONE); 1411 // arrange code to do most branches while loading and loading next characters 1412 // while comparing previous 1413 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1414 subs(cnt2, cnt2, 1); 1415 br(EQ, SHORT_LAST_INIT); 1416 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1417 b(SHORT_LOOP_START); 1418 bind(SHORT_LOOP); 1419 subs(cnt2, cnt2, 1); 1420 br(EQ, SHORT_LAST); 1421 bind(SHORT_LOOP_START); 1422 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1423 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1424 cmp(tmp1, cnt1); 1425 br(NE, SHORT_LOOP_TAIL); 1426 subs(cnt2, cnt2, 1); 1427 br(EQ, SHORT_LAST2); 1428 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1429 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1430 cmp(tmp2, rscratch1); 1431 br(EQ, SHORT_LOOP); 1432 sub(result, tmp2, rscratch1); 1433 b(DONE); 1434 bind(SHORT_LOOP_TAIL); 1435 sub(result, tmp1, cnt1); 1436 b(DONE); 1437 bind(SHORT_LAST2); 1438 cmp(tmp2, rscratch1); 1439 br(EQ, DONE); 1440 sub(result, tmp2, rscratch1); 1441 1442 b(DONE); 1443 bind(SHORT_LAST_INIT); 1444 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1445 bind(SHORT_LAST); 1446 cmp(tmp1, cnt1); 1447 br(EQ, DONE); 1448 sub(result, tmp1, cnt1); 1449 1450 bind(DONE); 1451 1452 BLOCK_COMMENT("} string_compare"); 1453 } 1454 1455 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1456 FloatRegister src2, Condition cond, bool isQ) { 1457 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1458 FloatRegister zn = src1, zm = src2; 1459 bool needs_negation = false; 1460 switch (cond) { 1461 case LT: cond = GT; zn = src2; zm = src1; break; 1462 case LE: cond = GE; zn = src2; zm = src1; break; 1463 case LO: cond = HI; zn = src2; zm = src1; break; 1464 case LS: cond = HS; zn = src2; zm = src1; break; 1465 case NE: cond = EQ; needs_negation = true; break; 1466 default: 1467 break; 1468 } 1469 1470 if (is_floating_point_type(bt)) { 1471 fcm(cond, dst, size, zn, zm); 1472 } else { 1473 cm(cond, dst, size, zn, zm); 1474 } 1475 1476 if (needs_negation) { 1477 notr(dst, isQ ? 
T16B : T8B, dst); 1478 } 1479 } 1480 1481 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1482 Condition cond, bool isQ) { 1483 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1484 if (bt == T_FLOAT || bt == T_DOUBLE) { 1485 if (cond == Assembler::NE) { 1486 fcm(Assembler::EQ, dst, size, src); 1487 notr(dst, isQ ? T16B : T8B, dst); 1488 } else { 1489 fcm(cond, dst, size, src); 1490 } 1491 } else { 1492 if (cond == Assembler::NE) { 1493 cm(Assembler::EQ, dst, size, src); 1494 notr(dst, isQ ? T16B : T8B, dst); 1495 } else { 1496 cm(cond, dst, size, src); 1497 } 1498 } 1499 } 1500 1501 // Compress the least significant bit of each byte to the rightmost and clear 1502 // the higher garbage bits. 1503 void C2_MacroAssembler::bytemask_compress(Register dst) { 1504 // Example input, dst = 0x01 00 00 00 01 01 00 01 1505 // The "??" bytes are garbage. 1506 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1507 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1508 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1509 andr(dst, dst, 0xff); // dst = 0x8D 1510 } 1511 1512 // Pack the lowest-numbered bit of each mask element in src into a long value 1513 // in dst, at most the first 64 lane elements. 1514 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1515 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1516 FloatRegister vtmp1, FloatRegister vtmp2) { 1517 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1518 assert_different_registers(dst, rscratch1); 1519 assert_different_registers(vtmp1, vtmp2); 1520 1521 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1522 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1523 // Expected: dst = 0x658D 1524 1525 // Convert the mask into vector with sequential bytes. 1526 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1527 sve_cpy(vtmp1, size, src, 1, false); 1528 if (bt != T_BYTE) { 1529 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1530 } 1531 1532 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1533 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1534 // is to compress each significant bit of the byte in a cross-lane way. Due 1535 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1536 // (bit-compress in each lane) with the biggest lane size (T = D) then 1537 // concatenate the results. 1538 1539 // The second source input of BEXT, initialized with 0x01 in each byte. 1540 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1541 sve_dup(vtmp2, B, 1); 1542 1543 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1544 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1545 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1546 // --------------------------------------- 1547 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1548 sve_bext(vtmp1, D, vtmp1, vtmp2); 1549 1550 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1551 // result to dst. 1552 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1553 // dst = 0x658D 1554 if (lane_cnt <= 8) { 1555 // No need to concatenate. 
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates mask value with the minimum unit in byte, we should
  // transform the value in the first lane which is mask in bit now to the
  // mask in byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1626 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1627 sve_dup(vtmp2, B, 1); 1628 1629 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1630 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1631 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1632 // --------------------------------------- 1633 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1634 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1635 1636 if (bt != T_BYTE) { 1637 sve_vector_extend(vtmp1, size, vtmp1, B); 1638 } 1639 // Generate mask according to the given vector, in which the elements have been 1640 // extended to expected type. 1641 // dst = 0b01101001 10001101 1642 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1643 } 1644 1645 // Clobbers: rflags 1646 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1647 FloatRegister zn, FloatRegister zm, Condition cond) { 1648 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1649 FloatRegister z1 = zn, z2 = zm; 1650 switch (cond) { 1651 case LE: z1 = zm; z2 = zn; cond = GE; break; 1652 case LT: z1 = zm; z2 = zn; cond = GT; break; 1653 case LO: z1 = zm; z2 = zn; cond = HI; break; 1654 case LS: z1 = zm; z2 = zn; cond = HS; break; 1655 default: 1656 break; 1657 } 1658 1659 SIMD_RegVariant size = elemType_to_regVariant(bt); 1660 if (is_floating_point_type(bt)) { 1661 sve_fcm(cond, pd, size, pg, z1, z2); 1662 } else { 1663 assert(is_integral_type(bt), "unsupported element type"); 1664 sve_cmp(cond, pd, size, pg, z1, z2); 1665 } 1666 } 1667 1668 // Get index of the last mask lane that is set 1669 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1670 SIMD_RegVariant size = elemType_to_regVariant(bt); 1671 sve_rev(ptmp, size, src); 1672 sve_brkb(ptmp, ptrue, ptmp, false); 1673 sve_cntp(dst, size, ptrue, ptmp); 1674 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1675 subw(dst, rscratch1, dst); 1676 } 1677 1678 // Extend integer vector src to dst with the same lane count 1679 // but larger element size, e.g. 4B -> 4I 1680 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1681 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1682 if (src_bt == T_BYTE) { 1683 if (dst_bt == T_SHORT) { 1684 // 4B/8B to 4S/8S 1685 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1686 } else { 1687 // 4B to 4I 1688 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1689 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1690 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1691 } 1692 } else if (src_bt == T_SHORT) { 1693 // 4S to 4I 1694 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1695 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1696 } else if (src_bt == T_INT) { 1697 // 2I to 2L 1698 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1699 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1700 } else { 1701 ShouldNotReachHere(); 1702 } 1703 } 1704 1705 // Narrow integer vector src down to dst with the same lane count 1706 // but smaller element size, e.g. 
4I -> 4B 1707 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1708 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1709 if (src_bt == T_SHORT) { 1710 // 4S/8S to 4B/8B 1711 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1712 assert(dst_bt == T_BYTE, "unsupported"); 1713 xtn(dst, T8B, src, T8H); 1714 } else if (src_bt == T_INT) { 1715 // 4I to 4B/4S 1716 assert(src_vlen_in_bytes == 16, "unsupported"); 1717 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1718 xtn(dst, T4H, src, T4S); 1719 if (dst_bt == T_BYTE) { 1720 xtn(dst, T8B, dst, T8H); 1721 } 1722 } else if (src_bt == T_LONG) { 1723 // 2L to 2I 1724 assert(src_vlen_in_bytes == 16, "unsupported"); 1725 assert(dst_bt == T_INT, "unsupported"); 1726 xtn(dst, T2S, src, T2D); 1727 } else { 1728 ShouldNotReachHere(); 1729 } 1730 } 1731 1732 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1733 FloatRegister src, SIMD_RegVariant src_size, 1734 bool is_unsigned) { 1735 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1736 1737 if (src_size == B) { 1738 switch (dst_size) { 1739 case H: 1740 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1741 break; 1742 case S: 1743 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1744 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1745 break; 1746 case D: 1747 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1748 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1749 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1750 break; 1751 default: 1752 ShouldNotReachHere(); 1753 } 1754 } else if (src_size == H) { 1755 if (dst_size == S) { 1756 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1757 } else { // D 1758 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1759 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1760 } 1761 } else if (src_size == S) { 1762 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1763 } 1764 } 1765 1766 // Vector narrow from src to dst with specified element sizes. 1767 // High part of dst vector will be filled with zero. 1768 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1769 FloatRegister src, SIMD_RegVariant src_size, 1770 FloatRegister tmp) { 1771 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1772 assert_different_registers(src, tmp); 1773 sve_dup(tmp, src_size, 0); 1774 if (src_size == D) { 1775 switch (dst_size) { 1776 case S: 1777 sve_uzp1(dst, S, src, tmp); 1778 break; 1779 case H: 1780 assert_different_registers(dst, tmp); 1781 sve_uzp1(dst, S, src, tmp); 1782 sve_uzp1(dst, H, dst, tmp); 1783 break; 1784 case B: 1785 assert_different_registers(dst, tmp); 1786 sve_uzp1(dst, S, src, tmp); 1787 sve_uzp1(dst, H, dst, tmp); 1788 sve_uzp1(dst, B, dst, tmp); 1789 break; 1790 default: 1791 ShouldNotReachHere(); 1792 } 1793 } else if (src_size == S) { 1794 if (dst_size == H) { 1795 sve_uzp1(dst, H, src, tmp); 1796 } else { // B 1797 assert_different_registers(dst, tmp); 1798 sve_uzp1(dst, H, src, tmp); 1799 sve_uzp1(dst, B, dst, tmp); 1800 } 1801 } else if (src_size == H) { 1802 sve_uzp1(dst, B, src, tmp); 1803 } 1804 } 1805 1806 // Extend src predicate to dst predicate with the same lane count but larger 1807 // element size, e.g. 
64Byte -> 512Long 1808 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1809 uint dst_element_length_in_bytes, 1810 uint src_element_length_in_bytes) { 1811 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1812 sve_punpklo(dst, src); 1813 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1814 sve_punpklo(dst, src); 1815 sve_punpklo(dst, dst); 1816 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1817 sve_punpklo(dst, src); 1818 sve_punpklo(dst, dst); 1819 sve_punpklo(dst, dst); 1820 } else { 1821 assert(false, "unsupported"); 1822 ShouldNotReachHere(); 1823 } 1824 } 1825 1826 // Narrow src predicate to dst predicate with the same lane count but 1827 // smaller element size, e.g. 512Long -> 64Byte 1828 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1829 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1830 // The insignificant bits in src predicate are expected to be zero. 1831 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1832 // passed as the second argument. An example narrowing operation with a given mask would be - 1833 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1834 // Mask (for 2 Longs) : TF 1835 // Predicate register for the above mask (16 bits) : 00000001 00000000 1836 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1837 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1838 assert_different_registers(src, ptmp); 1839 assert_different_registers(dst, ptmp); 1840 sve_pfalse(ptmp); 1841 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1842 sve_uzp1(dst, B, src, ptmp); 1843 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1844 sve_uzp1(dst, H, src, ptmp); 1845 sve_uzp1(dst, B, dst, ptmp); 1846 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1847 sve_uzp1(dst, S, src, ptmp); 1848 sve_uzp1(dst, H, dst, ptmp); 1849 sve_uzp1(dst, B, dst, ptmp); 1850 } else { 1851 assert(false, "unsupported"); 1852 ShouldNotReachHere(); 1853 } 1854 } 1855 1856 // Vector reduction add for integral type with ASIMD instructions. 1857 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1858 Register isrc, FloatRegister vsrc, 1859 unsigned vector_length_in_bytes, 1860 FloatRegister vtmp) { 1861 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1862 assert_different_registers(dst, isrc); 1863 bool isQ = vector_length_in_bytes == 16; 1864 1865 BLOCK_COMMENT("neon_reduce_add_integral {"); 1866 switch(bt) { 1867 case T_BYTE: 1868 addv(vtmp, isQ ? T16B : T8B, vsrc); 1869 smov(dst, vtmp, B, 0); 1870 addw(dst, dst, isrc, ext::sxtb); 1871 break; 1872 case T_SHORT: 1873 addv(vtmp, isQ ? T8H : T4H, vsrc); 1874 smov(dst, vtmp, H, 0); 1875 addw(dst, dst, isrc, ext::sxth); 1876 break; 1877 case T_INT: 1878 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1879 umov(dst, vtmp, S, 0); 1880 addw(dst, dst, isrc); 1881 break; 1882 case T_LONG: 1883 assert(isQ, "unsupported"); 1884 addpd(vtmp, vsrc); 1885 umov(dst, vtmp, D, 0); 1886 add(dst, dst, isrc); 1887 break; 1888 default: 1889 assert(false, "unsupported"); 1890 ShouldNotReachHere(); 1891 } 1892 BLOCK_COMMENT("} neon_reduce_add_integral"); 1893 } 1894 1895 // Vector reduction multiply for integral type with ASIMD instructions. 
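// For illustration only (not generated code): a scalar sketch of the value this
// multiply reduction is expected to produce for T_BYTE; the helper name and the
// plain-array signature are hypothetical.
//
//   int reduce_mul_byte(int isrc, const jbyte* v, int lanes) {
//     jbyte acc = (jbyte)isrc;
//     for (int i = 0; i < lanes; i++) {
//       acc = (jbyte)(acc * v[i]);  // the product wraps at the element width
//     }
//     return acc;                   // sign-extended, as the trailing sxtb below does
//   }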
1896 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1897 // Clobbers: rscratch1 1898 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1899 Register isrc, FloatRegister vsrc, 1900 unsigned vector_length_in_bytes, 1901 FloatRegister vtmp1, FloatRegister vtmp2) { 1902 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1903 bool isQ = vector_length_in_bytes == 16; 1904 1905 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1906 switch(bt) { 1907 case T_BYTE: 1908 if (isQ) { 1909 // Multiply the lower half and higher half of vector iteratively. 1910 // vtmp1 = vsrc[8:15] 1911 ins(vtmp1, D, vsrc, 0, 1); 1912 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1913 mulv(vtmp1, T8B, vtmp1, vsrc); 1914 // vtmp2 = vtmp1[4:7] 1915 ins(vtmp2, S, vtmp1, 0, 1); 1916 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1917 mulv(vtmp1, T8B, vtmp2, vtmp1); 1918 } else { 1919 ins(vtmp1, S, vsrc, 0, 1); 1920 mulv(vtmp1, T8B, vtmp1, vsrc); 1921 } 1922 // vtmp2 = vtmp1[2:3] 1923 ins(vtmp2, H, vtmp1, 0, 1); 1924 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1925 mulv(vtmp2, T8B, vtmp2, vtmp1); 1926 // dst = vtmp2[0] * isrc * vtmp2[1] 1927 umov(rscratch1, vtmp2, B, 0); 1928 mulw(dst, rscratch1, isrc); 1929 sxtb(dst, dst); 1930 umov(rscratch1, vtmp2, B, 1); 1931 mulw(dst, rscratch1, dst); 1932 sxtb(dst, dst); 1933 break; 1934 case T_SHORT: 1935 if (isQ) { 1936 ins(vtmp2, D, vsrc, 0, 1); 1937 mulv(vtmp2, T4H, vtmp2, vsrc); 1938 ins(vtmp1, S, vtmp2, 0, 1); 1939 mulv(vtmp1, T4H, vtmp1, vtmp2); 1940 } else { 1941 ins(vtmp1, S, vsrc, 0, 1); 1942 mulv(vtmp1, T4H, vtmp1, vsrc); 1943 } 1944 umov(rscratch1, vtmp1, H, 0); 1945 mulw(dst, rscratch1, isrc); 1946 sxth(dst, dst); 1947 umov(rscratch1, vtmp1, H, 1); 1948 mulw(dst, rscratch1, dst); 1949 sxth(dst, dst); 1950 break; 1951 case T_INT: 1952 if (isQ) { 1953 ins(vtmp1, D, vsrc, 0, 1); 1954 mulv(vtmp1, T2S, vtmp1, vsrc); 1955 } else { 1956 vtmp1 = vsrc; 1957 } 1958 umov(rscratch1, vtmp1, S, 0); 1959 mul(dst, rscratch1, isrc); 1960 umov(rscratch1, vtmp1, S, 1); 1961 mul(dst, rscratch1, dst); 1962 break; 1963 case T_LONG: 1964 umov(rscratch1, vsrc, D, 0); 1965 mul(dst, isrc, rscratch1); 1966 umov(rscratch1, vsrc, D, 1); 1967 mul(dst, dst, rscratch1); 1968 break; 1969 default: 1970 assert(false, "unsupported"); 1971 ShouldNotReachHere(); 1972 } 1973 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1974 } 1975 1976 // Vector reduction multiply for floating-point type with ASIMD instructions. 
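// For illustration only (not generated code): a scalar sketch of the T_FLOAT case;
// the helper name is hypothetical. The scalar input is folded in first and the
// lanes are then multiplied in order, which is observable because floating-point
// multiplication is not associative.
//
//   float reduce_mul_float(float fsrc, const float* v, int lanes) {
//     float acc = fsrc * v[0];
//     for (int i = 1; i < lanes; i++) {
//       acc = acc * v[i];
//     }
//     return acc;
//   }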
1977 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1978 FloatRegister fsrc, FloatRegister vsrc, 1979 unsigned vector_length_in_bytes, 1980 FloatRegister vtmp) { 1981 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1982 bool isQ = vector_length_in_bytes == 16; 1983 1984 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1985 switch(bt) { 1986 case T_FLOAT: 1987 fmuls(dst, fsrc, vsrc); 1988 ins(vtmp, S, vsrc, 0, 1); 1989 fmuls(dst, dst, vtmp); 1990 if (isQ) { 1991 ins(vtmp, S, vsrc, 0, 2); 1992 fmuls(dst, dst, vtmp); 1993 ins(vtmp, S, vsrc, 0, 3); 1994 fmuls(dst, dst, vtmp); 1995 } 1996 break; 1997 case T_DOUBLE: 1998 assert(isQ, "unsupported"); 1999 fmuld(dst, fsrc, vsrc); 2000 ins(vtmp, D, vsrc, 0, 1); 2001 fmuld(dst, dst, vtmp); 2002 break; 2003 default: 2004 assert(false, "unsupported"); 2005 ShouldNotReachHere(); 2006 } 2007 BLOCK_COMMENT("} neon_reduce_mul_fp"); 2008 } 2009 2010 // Helper to select logical instruction 2011 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 2012 Register Rn, Register Rm, 2013 enum shift_kind kind, unsigned shift) { 2014 switch(opc) { 2015 case Op_AndReductionV: 2016 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 2017 break; 2018 case Op_OrReductionV: 2019 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 2020 break; 2021 case Op_XorReductionV: 2022 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 2023 break; 2024 default: 2025 assert(false, "unsupported"); 2026 ShouldNotReachHere(); 2027 } 2028 } 2029 2030 // Vector reduction logical operations And, Or, Xor 2031 // Clobbers: rscratch1 2032 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2033 Register isrc, FloatRegister vsrc, 2034 unsigned vector_length_in_bytes) { 2035 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2036 "unsupported"); 2037 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2038 assert_different_registers(dst, isrc); 2039 bool isQ = vector_length_in_bytes == 16; 2040 2041 BLOCK_COMMENT("neon_reduce_logical {"); 2042 umov(rscratch1, vsrc, isQ ? D : S, 0); 2043 umov(dst, vsrc, isQ ? 
D : S, 1); 2044 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2045 switch(bt) { 2046 case T_BYTE: 2047 if (isQ) { 2048 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2049 } 2050 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2051 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2052 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2053 sxtb(dst, dst); 2054 break; 2055 case T_SHORT: 2056 if (isQ) { 2057 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2058 } 2059 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2060 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2061 sxth(dst, dst); 2062 break; 2063 case T_INT: 2064 if (isQ) { 2065 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2066 } 2067 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2068 break; 2069 case T_LONG: 2070 assert(isQ, "unsupported"); 2071 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2072 break; 2073 default: 2074 assert(false, "unsupported"); 2075 ShouldNotReachHere(); 2076 } 2077 BLOCK_COMMENT("} neon_reduce_logical"); 2078 } 2079 2080 // Vector reduction min/max for integral type with ASIMD instructions. 2081 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2082 // Clobbers: rscratch1, rflags 2083 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2084 Register isrc, FloatRegister vsrc, 2085 unsigned vector_length_in_bytes, 2086 FloatRegister vtmp) { 2087 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2088 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2089 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2090 assert_different_registers(dst, isrc); 2091 bool isQ = vector_length_in_bytes == 16; 2092 bool is_min = opc == Op_MinReductionV; 2093 2094 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2095 if (bt == T_LONG) { 2096 assert(vtmp == fnoreg, "should be"); 2097 assert(isQ, "should be"); 2098 umov(rscratch1, vsrc, D, 0); 2099 cmp(isrc, rscratch1); 2100 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2101 umov(rscratch1, vsrc, D, 1); 2102 cmp(dst, rscratch1); 2103 csel(dst, dst, rscratch1, is_min ? LT : GT); 2104 } else { 2105 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2106 if (size == T2S) { 2107 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2108 } else { 2109 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2110 } 2111 if (bt == T_INT) { 2112 umov(dst, vtmp, S, 0); 2113 } else { 2114 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2115 } 2116 cmpw(dst, isrc); 2117 cselw(dst, dst, isrc, is_min ? LT : GT); 2118 } 2119 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2120 } 2121 2122 // Vector reduction for integral type with SVE instruction. 2123 // Supported operations are Add, And, Or, Xor, Max, Min. 2124 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
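// For illustration only (not generated code): a scalar sketch of the T_SHORT And
// case, with hypothetical names. Only lanes selected by the governing predicate
// take part; the vector result is combined with the scalar input src1 and the
// sub-word result is sign-extended, as the trailing sxtb/sxth below do.
//
//   int reduce_and_short(int src1, const jshort* v, const bool* pg, int lanes) {
//     jshort acc = -1;                 // identity element for And
//     for (int i = 0; i < lanes; i++) {
//       if (pg[i]) acc &= v[i];
//     }
//     return (jshort)(acc & src1);     // returned sign-extended to 32 bits
//   }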
2125 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2126 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2127 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2128 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2129 assert_different_registers(src1, dst); 2130 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2131 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2132 switch (opc) { 2133 case Op_AddReductionVI: { 2134 sve_uaddv(tmp, size, pg, src2); 2135 if (bt == T_BYTE) { 2136 smov(dst, tmp, size, 0); 2137 addw(dst, src1, dst, ext::sxtb); 2138 } else if (bt == T_SHORT) { 2139 smov(dst, tmp, size, 0); 2140 addw(dst, src1, dst, ext::sxth); 2141 } else { 2142 umov(dst, tmp, size, 0); 2143 addw(dst, dst, src1); 2144 } 2145 break; 2146 } 2147 case Op_AddReductionVL: { 2148 sve_uaddv(tmp, size, pg, src2); 2149 umov(dst, tmp, size, 0); 2150 add(dst, dst, src1); 2151 break; 2152 } 2153 case Op_AndReductionV: { 2154 sve_andv(tmp, size, pg, src2); 2155 if (bt == T_INT || bt == T_LONG) { 2156 umov(dst, tmp, size, 0); 2157 } else { 2158 smov(dst, tmp, size, 0); 2159 } 2160 if (bt == T_LONG) { 2161 andr(dst, dst, src1); 2162 } else { 2163 andw(dst, dst, src1); 2164 } 2165 break; 2166 } 2167 case Op_OrReductionV: { 2168 sve_orv(tmp, size, pg, src2); 2169 if (bt == T_INT || bt == T_LONG) { 2170 umov(dst, tmp, size, 0); 2171 } else { 2172 smov(dst, tmp, size, 0); 2173 } 2174 if (bt == T_LONG) { 2175 orr(dst, dst, src1); 2176 } else { 2177 orrw(dst, dst, src1); 2178 } 2179 break; 2180 } 2181 case Op_XorReductionV: { 2182 sve_eorv(tmp, size, pg, src2); 2183 if (bt == T_INT || bt == T_LONG) { 2184 umov(dst, tmp, size, 0); 2185 } else { 2186 smov(dst, tmp, size, 0); 2187 } 2188 if (bt == T_LONG) { 2189 eor(dst, dst, src1); 2190 } else { 2191 eorw(dst, dst, src1); 2192 } 2193 break; 2194 } 2195 case Op_MaxReductionV: { 2196 sve_smaxv(tmp, size, pg, src2); 2197 if (bt == T_INT || bt == T_LONG) { 2198 umov(dst, tmp, size, 0); 2199 } else { 2200 smov(dst, tmp, size, 0); 2201 } 2202 if (bt == T_LONG) { 2203 cmp(dst, src1); 2204 csel(dst, dst, src1, Assembler::GT); 2205 } else { 2206 cmpw(dst, src1); 2207 cselw(dst, dst, src1, Assembler::GT); 2208 } 2209 break; 2210 } 2211 case Op_MinReductionV: { 2212 sve_sminv(tmp, size, pg, src2); 2213 if (bt == T_INT || bt == T_LONG) { 2214 umov(dst, tmp, size, 0); 2215 } else { 2216 smov(dst, tmp, size, 0); 2217 } 2218 if (bt == T_LONG) { 2219 cmp(dst, src1); 2220 csel(dst, dst, src1, Assembler::LT); 2221 } else { 2222 cmpw(dst, src1); 2223 cselw(dst, dst, src1, Assembler::LT); 2224 } 2225 break; 2226 } 2227 default: 2228 assert(false, "unsupported"); 2229 ShouldNotReachHere(); 2230 } 2231 2232 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2233 if (bt == T_BYTE) { 2234 sxtb(dst, dst); 2235 } else if (bt == T_SHORT) { 2236 sxth(dst, dst); 2237 } 2238 } 2239 } 2240 2241 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2242 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2243 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
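// For illustration only (not generated code): a scalar model of the predicate this
// routine materializes, with hypothetical names. Lane i is active iff i < lane_cnt;
// the routine below merely picks the cheapest encoding for that pattern (pfalse,
// a fixed ptrue pattern such as VL8/POW2/MUL3/MUL4, or a whileltw fallback).
//
//   void gen_mask_imm(bool* dst, uint32_t lane_cnt, uint32_t max_lanes) {
//     for (uint32_t i = 0; i < max_lanes; i++) {
//       dst[i] = (i < lane_cnt);
//     }
//   }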
2244 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2245 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2246 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2247 2248 // Set all elements to false if the input "lane_cnt" is zero. 2249 if (lane_cnt == 0) { 2250 sve_pfalse(dst); 2251 return; 2252 } 2253 2254 SIMD_RegVariant size = elemType_to_regVariant(bt); 2255 assert(size != Q, "invalid size"); 2256 2257 // Set all true if "lane_cnt" equals to the max lane count. 2258 if (lane_cnt == max_vector_length) { 2259 sve_ptrue(dst, size, /* ALL */ 0b11111); 2260 return; 2261 } 2262 2263 // Fixed numbers for "ptrue". 2264 switch(lane_cnt) { 2265 case 1: /* VL1 */ 2266 case 2: /* VL2 */ 2267 case 3: /* VL3 */ 2268 case 4: /* VL4 */ 2269 case 5: /* VL5 */ 2270 case 6: /* VL6 */ 2271 case 7: /* VL7 */ 2272 case 8: /* VL8 */ 2273 sve_ptrue(dst, size, lane_cnt); 2274 return; 2275 case 16: 2276 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2277 return; 2278 case 32: 2279 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2280 return; 2281 case 64: 2282 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2283 return; 2284 case 128: 2285 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2286 return; 2287 case 256: 2288 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2289 return; 2290 default: 2291 break; 2292 } 2293 2294 // Special patterns for "ptrue". 2295 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2296 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2297 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2298 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2299 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2300 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2301 } else { 2302 // Encode to "whileltw" for the remaining cases. 2303 mov(rscratch1, lane_cnt); 2304 sve_whileltw(dst, size, zr, rscratch1); 2305 } 2306 } 2307 2308 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2309 // Any remaining elements of dst will be filled with zero. 2310 // Clobbers: rscratch1 2311 // Preserves: src, mask 2312 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2313 FloatRegister vtmp1, FloatRegister vtmp2, 2314 PRegister pgtmp) { 2315 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2316 assert_different_registers(dst, src, vtmp1, vtmp2); 2317 assert_different_registers(mask, pgtmp); 2318 2319 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 2320 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 2321 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 2322 sve_dup(vtmp2, H, 0); 2323 2324 // Extend lowest half to type INT. 2325 // dst = 00004444 00003333 00002222 00001111 2326 sve_uunpklo(dst, S, src); 2327 // pgtmp = 00000001 00000000 00000001 00000001 2328 sve_punpklo(pgtmp, mask); 2329 // Pack the active elements in size of type INT to the right, 2330 // and fill the remainings with zero. 2331 // dst = 00000000 00004444 00002222 00001111 2332 sve_compact(dst, S, dst, pgtmp); 2333 // Narrow the result back to type SHORT. 2334 // dst = 0000 0000 0000 0000 0000 4444 2222 1111 2335 sve_uzp1(dst, H, dst, vtmp2); 2336 // Count the active elements of lowest half. 2337 // rscratch1 = 3 2338 sve_cntp(rscratch1, S, ptrue, pgtmp); 2339 2340 // Repeat to the highest half. 
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where TRUE_CNT
  // is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high (after shifting) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src  = 88 77 66 55 44 33 22 11
  //                  mask = 01 00 00 01 01 00 01 01
  // Expected result: dst  = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining lanes with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat to the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:  dst   = 00 00 00 00 00 44 22 11
  // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where TRUE_CNT
  // is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high (after shifting) with the compressed low.
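  // (Illustrative note: after the shift, the high part is non-zero only in lanes at
  // or above TRUE_CNT, while the compressed low part is non-zero only below it, so
  // a plain OR is sufficient to merge the two halves.)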
2417 // dst = 00 00 00 88 55 44 22 11 2418 sve_orr(dst, dst, vtmp1); 2419 } 2420 2421 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2422 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2423 SIMD_Arrangement size = isQ ? T16B : T8B; 2424 if (bt == T_BYTE) { 2425 rbit(dst, size, src); 2426 } else { 2427 neon_reverse_bytes(dst, src, bt, isQ); 2428 rbit(dst, size, dst); 2429 } 2430 } 2431 2432 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2433 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2434 SIMD_Arrangement size = isQ ? T16B : T8B; 2435 switch (bt) { 2436 case T_BYTE: 2437 if (dst != src) { 2438 orr(dst, size, src, src); 2439 } 2440 break; 2441 case T_SHORT: 2442 rev16(dst, size, src); 2443 break; 2444 case T_INT: 2445 rev32(dst, size, src); 2446 break; 2447 case T_LONG: 2448 rev64(dst, size, src); 2449 break; 2450 default: 2451 assert(false, "unsupported"); 2452 ShouldNotReachHere(); 2453 } 2454 } 2455 2456 // Extract a scalar element from an sve vector at position 'idx'. 2457 // The input elements in src are expected to be of integral type. 2458 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2459 int idx, FloatRegister vtmp) { 2460 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2461 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2462 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2463 if (bt == T_INT || bt == T_LONG) { 2464 umov(dst, src, size, idx); 2465 } else { 2466 smov(dst, src, size, idx); 2467 } 2468 } else { 2469 sve_orr(vtmp, src, src); 2470 sve_ext(vtmp, vtmp, idx << size); 2471 if (bt == T_INT || bt == T_LONG) { 2472 umov(dst, vtmp, size, 0); 2473 } else { 2474 smov(dst, vtmp, size, 0); 2475 } 2476 } 2477 } 2478 2479 // java.lang.Math::round intrinsics 2480 2481 // Clobbers: rscratch1, rflags 2482 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2483 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2484 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2485 switch (T) { 2486 case T2S: 2487 case T4S: 2488 fmovs(tmp1, T, 0.5f); 2489 mov(rscratch1, jint_cast(0x1.0p23f)); 2490 break; 2491 case T2D: 2492 fmovd(tmp1, T, 0.5); 2493 mov(rscratch1, julong_cast(0x1.0p52)); 2494 break; 2495 default: 2496 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2497 } 2498 fadd(tmp1, T, tmp1, src); 2499 fcvtms(tmp1, T, tmp1); 2500 // tmp1 = floor(src + 0.5, ties to even) 2501 2502 fcvtas(dst, T, src); 2503 // dst = round(src), ties to away 2504 2505 fneg(tmp3, T, src); 2506 dup(tmp2, T, rscratch1); 2507 cm(HS, tmp3, T, tmp3, tmp2); 2508 // tmp3 is now a set of flags 2509 2510 bif(dst, T16B, tmp1, tmp3); 2511 // result in dst 2512 } 2513 2514 // Clobbers: rscratch1, rflags 2515 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2516 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2517 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2518 assert_different_registers(tmp1, tmp2, src, dst); 2519 2520 switch (T) { 2521 case S: 2522 mov(rscratch1, jint_cast(0x1.0p23f)); 2523 break; 2524 case D: 2525 mov(rscratch1, julong_cast(0x1.0p52)); 2526 break; 2527 
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1);                      // dst = 0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
  case S:
    sve_and(vtmp, T, min_jint);       // Extract the sign bit of the float value in every lane of src
    sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                      // on the sign of the float value
    break;
  case D:
    sve_and(vtmp, T, min_jlong);
    sve_orr(vtmp, T, jlong_cast(1.0));
    break;
  default:
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}
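// For illustration only (not generated code): a scalar sketch of the sign-copy trick
// used by vector_signum_neon/vector_signum_sve above for the float case; the helper
// name is hypothetical. Zeros and NaNs are returned unchanged, every other value
// maps to +1.0f or -1.0f by copying its sign bit onto 1.0f.
//
//   float signum_float(float x) {
//     if (!(fabsf(x) > 0.0f)) {                      // +-0.0f and NaN fall out here
//       return x;
//     }
//     jint bits = (jint_cast(x) & min_jint) | jint_cast(1.0f);
//     return jfloat_cast(bits);
//   }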