/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
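  // The low two bits of the markWord encode the lock state (a rough sketch;
  // see markWord.hpp for the authoritative layout):
  //   [ptr            | 00]  locked:   ptr points to the displaced header on the stack
  //   [header         | 01]  unlocked: regular object header
  //   [ptr            | 10]  monitor:  ptr points to the inflated ObjectMonitor
  //   [ptr            | 11]  marked:   used, e.g., by GC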
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If condition is true we are done (cont) and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
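  // (unused_mark has both low lock bits set (0b11), so it can be confused
  // neither with zero, which would look like a recursive stack lock, nor
  // with a valid stack address.)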
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
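  // (If either list is non-empty there may be threads waiting to enter;
  // take the slow path so the runtime can wake a successor instead of
  // simply dropping the lock here.)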
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to it with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to it with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to it with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
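      // (Each cache entry pairs an oop with its ObjectMonitor*; entries are
      // scanned linearly and the array ends in a null sentinel, so a null
      // oop means a definite miss.)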
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to it with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to it with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    Label release;
    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, release);

    // The owner may be anonymous and we removed the last obj entry in
    // the lock-stack. This loses the information about the owner.
    // Write the thread to the owner field so the runtime knows the owner.
    str(rthread, Address(t2_owner_addr));
    b(slow_path);

    bind(release);
    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer-Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
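  //
  // For instance, if the source character aligned with the last pattern
  // position does not occur in the pattern at all, the whole pattern can
  // be shifted past it; if it does occur, the shift is the distance from
  // that character's last occurrence to the end of the pattern.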
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //       int i, j;
  //       unsigned c;
  //       unsigned char bc[ASIZE];
  //
  //       /* Preprocessing */
  //       for (i = 0; i < ASIZE; ++i)
  //          bc[i] = m;
  //       for (i = 0; i < m - 1; ) {
  //          c = x[i];
  //          ++i;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef PATTERN_STRING_IS_LATIN1
  //          bc[c] = m - i;
  //          #else
  //          if (c < ASIZE) bc[c] = m - i;
  //          #endif
  //       }
  //
  //       /* Searching */
  //       j = 0;
  //       while (j <= n - m) {
  //          c = y[i+j];
  //          if (x[m-1] == c)
  //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //          if (i < 0) return j;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef SOURCE_STRING_IS_LATIN1
  //          // LL case: (c< 256) always true. Remove branch
  //          j += bc[y[j+m-1]];
  //          #endif
  //          #ifndef PATTERN_STRING_IS_UTF
  //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += 1
  //          #endif
  //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += m
  //          #endif
  //       }
  //    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half a register for
    // the UL case. We'll re-read the last character in inner pre-loop code to have
    // a single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8); // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching a Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
    BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

    BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

    BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
    BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

    BIND(HAS_ZERO);
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

    BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
    BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
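  // (In SVE, only the low predicate registers p0-p7 may act as the governing
  // predicate of most predicated operations, i.e. select the active lanes;
  // tmp_pdn is only written as a destination here.)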
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
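// The result follows java.lang.String::compareTo semantics: negative, zero
// or positive, based on the first differing character, or on the length
// difference when one string is a prefix of the other. 'ae' encodes the
// character encodings of the two arguments (StrIntrinsicNode::LL, UU, LU, UL).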
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
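  // (If no character differs within the first min(cnt1, cnt2) characters,
  // this saved length difference becomes the final result.)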
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected: dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates mask value with the minimum unit in byte, we should
  // transform the value in the first lane which is mask in bit now to the
  // mask in byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate mask according to the given vector, in which the elements have been
  // extended to expected type.
  // dst = 0b01101001 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          bool is_unsigned) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

  if (src_size == B) {
    switch (dst_size) {
    case H:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      break;
    case S:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      break;
    case D:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
    } else { // D
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
    }
  } else if (src_size == S) {
    _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// The high part of the dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}
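
// Illustrative trace of sve_vector_narrow for D -> B (comment only), assuming a
// 128-bit vector with src = 0x00000000000000bb_00000000000000aa and tmp zeroed:
//   sve_uzp1(dst, S, src, tmp) => dst = 0x00000000_00000000_000000bb_000000aa
//   sve_uzp1(dst, H, dst, tmp) => dst = 0x0000000000000000_0000000000bb00aa
//   sve_uzp1(dst, B, dst, tmp) => dst = 0x0000000000000000_000000000000bbaa
// Each uzp1 concatenates the even-indexed elements of its two sources (the data
// first, then the zeroed tmp), halving the element size while zero-filling the
// high part.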

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in the src predicate are expected to be zero.
  // To ensure the higher order bits of the resulting narrowed vector are 0, an
  // all-zero predicate is passed as the second argument. An example narrowing
  // operation with a given mask would be:
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
  // Mask (for 2 Longs)                              : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b)     : 0000 0000 0001 0000
  // Which translates to a mask for 2 integers       : TF
  // (the lower half is considered, while the upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  sve_pfalse(ptmp);
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}
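
// Illustrative trace of the T_BYTE case above (comment only), with isQ == false,
// vsrc = {1, 2, 3, 4, 5, 6, 7, 8} and isrc = 10:
//   addv(vtmp, T8B, vsrc)           => vtmp lane 0 = 36 (byte sum, wraps modulo 256)
//   smov(dst, vtmp, B, 0)           => dst = 36 (sign-extended to 32 bits)
//   addw(dst, dst, isrc, ext::sxtb) => dst = 36 + 10 = 46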

// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of the vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1]
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
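
// Shape of the T_BYTE reduction above (comment only): for isQ == true, the 16
// lanes are folded pairwise, halving the number of live partial products each
// step (16 -> 8 -> 4 -> 2). The last two byte products are then moved to a
// general register and multiplied with isrc, with sxtb re-normalizing the
// wrapped byte product after each scalar multiply.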

// Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
  switch(bt) {
    case T_FLOAT:
      fmuls(dst, fsrc, vsrc);
      ins(vtmp, S, vsrc, 0, 1);
      fmuls(dst, dst, vtmp);
      if (isQ) {
        ins(vtmp, S, vsrc, 0, 2);
        fmuls(dst, dst, vtmp);
        ins(vtmp, S, vsrc, 0, 3);
        fmuls(dst, dst, vtmp);
      }
      break;
    case T_DOUBLE:
      assert(isQ, "unsupported");
      fmuld(dst, fsrc, vsrc);
      ins(vtmp, D, vsrc, 0, 1);
      fmuld(dst, dst, vtmp);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select the logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}
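
// How the shift-folded reduction below works (comment only), e.g. for T_INT on
// a 128-bit vector: the two 64-bit halves of vsrc are combined first, then
//   neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, LSR, 32)
// folds the upper word onto the lower one, leaving the 4-lane result in the low
// 32 bits of dst before isrc is merged in.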

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, is_min ? LT : GT);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, is_min ? LT : GT);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
    } else {
      is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
    }
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, is_min ? LT : GT);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
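
// Illustrative trace of the non-long path above (comment only), for
// Op_MinReductionV with bt == T_INT, isQ == true, vsrc = {4, 1, 7, 3}, isrc = 2:
//   sminv(vtmp, T4S, vsrc)     => vtmp lane 0 = 1
//   umov(dst, vtmp, S, 0)      => dst = 1
//   cmpw(dst, isrc);
//   cselw(dst, dst, isrc, LT)  => dst = min(1, 2) = 1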

// Vector reduction for integral type with SVE instructions.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Registers "dst" and "tmp" are to be clobbered, while "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
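
// A minimal usage sketch (hypothetical operand names, mirroring how the
// aarch64_sve.ad instruct bodies typically invoke this helper):
//   sve_reduce_integral(Op_AddReductionVI, $dst$$Register, T_INT,
//                       $isrc$$Register, $vsrc$$FloatRegister,
//                       ptrue, $tmp$$FloatRegister);
// computes dst = isrc + vsrc[0] + ... + vsrc[n-1] over the lanes selected by
// the governing predicate (here an all-true one).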

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt),
// or to false otherwise. The input "lane_cnt" should be smaller than or equal to the
// supported max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all elements to true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}
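
// Dispatch example for the helper above (comment only), assuming T_BYTE on a
// 512-bit SVE machine, i.e. max_vector_length == 64:
//   lane_cnt == 7  -> sve_ptrue(dst, B, VL7)   (fixed-number pattern)
//   lane_cnt == 32 -> sve_ptrue(dst, B, VL32)
//   lane_cnt == 63 -> sve_ptrue(dst, B, MUL3)  (63 == 64 - 64 % 3)
//   lane_cnt == 48 -> mov(rscratch1, 48);
//                     sve_whileltw(dst, B, zr, rscratch1)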

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining elements with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat for the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high part (after shifting) with the compressed low part.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src  = 88 77 66 55 44 33 22 11
  //                  mask = 01 00 00 01 01 00 01 01
  // Expected result: dst  = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining elements with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat for the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:  dst   = 00 00 00 00 00 44 22 11
  // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high part (after shifting) with the compressed low part.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}
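
// Design note (comment only): SVE's COMPACT instruction only supports S- and
// D-sized elements, which is why sve_compress_short widens H lanes to S before
// compacting, and sve_compress_byte widens B lanes to H and delegates each half
// to sve_compress_short.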

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an SVE vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate the lower-cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
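
// Why both conversions are computed above (comment only):
// - fcvtas (round, ties away from zero) matches Java's round-half-up for
//   non-negative inputs, and is immune to the classic mishap where adding 0.5
//   first rounds 0.49999997f up to 1.0f.
// - For small-magnitude negative inputs, half-up differs from half-away
//   (Math.round(-2.5) == -2, while ties-away gives -3), so those lanes take
//   floor(src + 0.5) instead.
// The cm(HS) compare on the negated bit patterns marks the lanes that must keep
// the fcvtas result; bif then merges the two vectors accordingly.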

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}