/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Bind a label and (in non-product builds) emit its name as a block comment.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Member-function-pointer type for the single-character load instructions
// (ldrb for Latin1, ldrh for UTF-16), selected at code-emission time.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Emit the C2 fast-path monitor-enter sequence for LM_MONITOR / LM_LEGACY.
// On exit, flags == EQ indicates the lock was acquired (continue inline),
// flags == NE indicates failure (take the slow path).
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must never be synchronized on; send them to the
    // slow path (flags == NE) so the runtime can diagnose.
    load_klass(tmp, oop);
    ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
    tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, will have now locked it will continue at label cont

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If condition is true we are cont and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

// Emit the C2 fast-path monitor-exit sequence for LM_MONITOR / LM_LEGACY.
// On exit, flags == EQ indicates the lock was released (continue inline),
// flags == NE indicates failure (take the slow path).
void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a light weight lock, this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result (EQ: success)
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

// Emit the C2 fast-path lock sequence for LM_LIGHTWEIGHT.
// Branches to `locked` with flags == EQ on success and to `slow_path`
// with flags == NE on failure; C2 dispatches on the final flag state.
void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  // Clear box. TODO[OMWorld]: Is this necessary? May also defer this to not write twice.
  str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must never be synchronized on; take the slow path.
    load_klass(t1, obj);
    ldrw(t1, Address(t1, Klass::access_flags_offset()));
    tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    if (!OMUseC2Cache) {
      // Set Flags == NE
      cmp(zr, obj);
      b(slow_path);
    } else {

      if (OMCacheHitRate) increment(Address(rthread, JavaThread::lock_lookup_offset()));

      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      // Unrolled portion of the cache lookup.
      const int num_unrolled = MIN2(OMC2UnrollCacheEntries, OMCacheSize);
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        if (i + 1 != num_unrolled) {
          increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
        }
      }

      if (num_unrolled == 0 || (OMC2UnrollCacheLookupLoopTail && num_unrolled != OMCacheSize)) {
        if (num_unrolled != 0) {
          // Loop after unrolling, advance iterator.
          increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
        }

        Label loop;

        // Search for obj in cache.
        bind(loop);

        // Check for match.
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);

        // Search until null encountered, guaranteed _null_sentinel at end.
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
        cbnz(t1, loop);
        // Cache Miss, NE set from cmp above, cbnz does not set flags
        b(slow_path);
      } else {
        b(slow_path);
      }

      bind(monitor_found);
      ldr(t1, Address(t3_t, OMCache::oop_to_monitor_difference()));
      if (OMCacheHitRate) increment(Address(rthread, JavaThread::lock_hit_offset()));

      // ObjectMonitor* is in t1
      const Register t1_monitor = t1;
      const Register t2_owner_addr = t2;
      const Register t3_owner = t3;

      Label recursive;
      Label monitor_locked;

      // Compute owner address.
      lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

      if (OMRecursiveFastPath) {
        // Check the owner first; a recursive enter avoids the CAS entirely.
        ldr(t3_owner, Address(t2_owner_addr));
        cmp(t3_owner, rthread);
        br(Assembler::EQ, recursive);
      }

      // CAS owner (null => current thread).
      cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
              /*release*/ false, /*weak*/ false, t3_owner);
      br(Assembler::EQ, monitor_locked);

      if (OMRecursiveFastPath) {
        b(slow_path);
      } else {
        // Check if recursive.
        cmp(t3_owner, rthread);
        br(Assembler::NE, slow_path);
      }

      // Recursive.
      bind(recursive);
      increment(Address(t1_monitor, ObjectMonitor::recursions_offset()), 1);

      bind(monitor_locked);
      // Cache the monitor in the box for the matching fast unlock.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }

  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Emit the C2 fast-path unlock sequence for LM_LIGHTWEIGHT.
// Branches to `unlocked` with flags == EQ on success and to `slow_path`
// with flags == NE on failure; C2 dispatches on the final flag state.
void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST branch to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_monitor);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), push_and_slow_path);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_monitor);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    // Verify obj is nowhere (else) on the lock-stack.
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    if (!OMUseC2Cache) {
      b(slow_path);
    } else {
      const Register t1_monitor = t1;

      if (OMCacheHitRate) increment(Address(rthread, JavaThread::unlock_lookup_offset()));
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // TODO: Cleanup these constants (with an enum and asserts)
      cmp(t1_monitor, (uint8_t)2);
      // Non symmetrical, take slow path monitor == 0 or 1, 0 and 1 < 2, both LS and NE
      br(Assembler::LO, slow_path);
      if (OMCacheHitRate) increment(Address(rthread, JavaThread::unlock_hit_offset()));

      const Register t2_recursions = t2;
      Label not_recursive;

      // Check if recursive.
      ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
      cbz(t2_recursions, not_recursive);

      // Recursive unlock.
      sub(t2_recursions, t2_recursions, 1u);
      str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
      // Set flag == EQ
      cmp(t2_recursions, t2_recursions);
      b(unlocked);

      bind(not_recursive);

      Label release;
      const Register t2_owner_addr = t2;

      // Compute owner address.
      lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

      // Check if the entry lists are empty.
      ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
      ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
      orr(rscratch1, rscratch1, t3_t);
      cmp(rscratch1, zr);
      br(Assembler::EQ, release);

      // The owner may be anonymous and we removed the last obj entry in
      // the lock-stack. This loses the information about the owner.
      // Write the thread to the owner field so the runtime knows the owner.
      str(rthread, Address(t2_owner_addr));
      b(slow_path);

      bind(release);
      // Set owner to null.
      // Release to satisfy the JMM
      stlr(zr, t2_owner_addr);
    }
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  // Per-string encodings: L = Latin1 (1 byte/char), U = UTF-16 (2 bytes/char).
  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //       int i, j;
  //       unsigned c;
  //       unsigned char bc[ASIZE];
  //
  //       /* Preprocessing */
  //       for (i = 0; i < ASIZE; ++i)
  //          bc[i] = m;
  //       for (i = 0; i < m - 1; ) {
  //          c = x[i];
  //          ++i;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef PATTERN_STRING_IS_LATIN1
  //          bc[c] = m - i;
  //          #else
  //          if (c < ASIZE) bc[c] = m - i;
  //          #endif
  //       }
  //
  //       /* Searching */
  //       j = 0;
  //       while (j <= n - m) {
  //          c = y[i+j];
  //          if (x[m-1] == c)
  //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //          if (i < 0) return j;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef SOURCE_STRING_IS_LATIN1
  //          // LL case: (c< 256) always true. Remove branch
  //          j += bc[y[j+m-1]];
  //          #endif
  //          #ifndef PATTERN_STRING_IS_UTF
  //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += 1
  //          #endif
  //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += m
  //          #endif
  //       }
  //    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // Build the bad-character table on the stack, initialized to cnt1
    // (v0 was pre-filled with cnt1 above).
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8); // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE); // pop bad-character table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE); // pop bad-character table
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
    BIND(LINEAR_MEDIUM);
      // Scan for the first pattern char, then verify the rest on a hit.
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

    BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

    BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      // Pattern of exactly 4 chars: compare one 4-char load per position.
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      // Pattern of 2 chars: compare one 2-char load per position.
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      // Pattern of 3 chars: 2-char load for the prefix plus 1 char tail check.
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      // Single-char pattern: SWAR word-at-a-time scan when cnt2 >= 8,
      // otherwise simple per-character loop.
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      // Replicate the search char across the whole 64-bit register.
      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
    BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      // Classic "zero byte/halfword" SWAR test on the xor result.
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      // Handle the last (possibly overlapping) 8-byte chunk.
      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

    BIND(HAS_ZERO);
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

    BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
    BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of char `ch` in the UTF-16 string (str1, cnt1);
// `result` receives the index or -1. Uses a SWAR word-at-a-time scan for
// cnt1 >= 4 and a per-character loop otherwise.
// Clobbers: rscratch1, rscratch2, rflags; modifies str1/cnt1/ch in place.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate the 16-bit char across the whole 64-bit register.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    // SWAR "zero halfword" test on the xor result.
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Handle the last (possibly overlapping) 8-byte chunk.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
1093 sve_dup(ztmp2, T, ch); 1094 if (isL) { 1095 sve_cntb(vec_len); 1096 } else { 1097 sve_cnth(vec_len); 1098 } 1099 mov(idx, 0); 1100 1101 // Generate a predicate to control the reading of input string. 1102 sve_whilelt(tmp_pg, T, idx, cnt1); 1103 1104 BIND(LOOP); 1105 // Read a vector of 8- or 16-bit data depending on the string type. Note 1106 // that inactive elements indicated by the predicate register won't cause 1107 // a data read from memory to the destination vector. 1108 if (isL) { 1109 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1110 } else { 1111 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1112 } 1113 add(idx, idx, vec_len); 1114 1115 // Perform the comparison. An element of the destination predicate is set 1116 // to active if the particular char is matched. 1117 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1118 1119 // Branch if the particular char is found. 1120 br(NE, MATCH); 1121 1122 sve_whilelt(tmp_pg, T, idx, cnt1); 1123 1124 // Loop back if the particular char not found. 1125 br(MI, LOOP); 1126 1127 BIND(NOMATCH); 1128 mov(result, -1); 1129 b(DONE); 1130 1131 BIND(MATCH); 1132 // Undo the index increment. 1133 sub(idx, idx, vec_len); 1134 1135 // Crop the vector to find its location. 
1136 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1137 add(result, idx, -1); 1138 sve_incp(result, T, tmp_pdn); 1139 BIND(DONE); 1140 } 1141 1142 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1143 Register ch, Register result, 1144 Register tmp1, Register tmp2, Register tmp3) 1145 { 1146 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1147 Register cnt1_neg = cnt1; 1148 Register ch1 = rscratch1; 1149 Register result_tmp = rscratch2; 1150 1151 cbz(cnt1, NOMATCH); 1152 1153 cmp(cnt1, (u1)8); 1154 br(LT, DO1_SHORT); 1155 1156 orr(ch, ch, ch, LSL, 8); 1157 orr(ch, ch, ch, LSL, 16); 1158 orr(ch, ch, ch, LSL, 32); 1159 1160 sub(cnt1, cnt1, 8); 1161 mov(result_tmp, cnt1); 1162 lea(str1, Address(str1, cnt1)); 1163 sub(cnt1_neg, zr, cnt1); 1164 1165 mov(tmp3, 0x0101010101010101); 1166 1167 BIND(CH1_LOOP); 1168 ldr(ch1, Address(str1, cnt1_neg)); 1169 eor(ch1, ch, ch1); 1170 sub(tmp1, ch1, tmp3); 1171 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1172 bics(tmp1, tmp1, tmp2); 1173 br(NE, HAS_ZERO); 1174 adds(cnt1_neg, cnt1_neg, 8); 1175 br(LT, CH1_LOOP); 1176 1177 cmp(cnt1_neg, (u1)8); 1178 mov(cnt1_neg, 0); 1179 br(LT, CH1_LOOP); 1180 b(NOMATCH); 1181 1182 BIND(HAS_ZERO); 1183 rev(tmp1, tmp1); 1184 clz(tmp1, tmp1); 1185 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1186 b(MATCH); 1187 1188 BIND(DO1_SHORT); 1189 mov(result_tmp, cnt1); 1190 lea(str1, Address(str1, cnt1)); 1191 sub(cnt1_neg, zr, cnt1); 1192 BIND(DO1_LOOP); 1193 ldrb(ch1, Address(str1, cnt1_neg)); 1194 cmp(ch, ch1); 1195 br(EQ, MATCH); 1196 adds(cnt1_neg, cnt1_neg, 1); 1197 br(LT, DO1_LOOP); 1198 BIND(NOMATCH); 1199 mov(result, -1); 1200 b(DONE); 1201 BIND(MATCH); 1202 add(result, result_tmp, cnt1_neg); 1203 BIND(DONE); 1204 } 1205 1206 // Compare strings. 
1207 void C2_MacroAssembler::string_compare(Register str1, Register str2, 1208 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, 1209 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, 1210 PRegister pgtmp1, PRegister pgtmp2, int ae) { 1211 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 1212 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 1213 SHORT_LOOP_START, TAIL_CHECK; 1214 1215 bool isLL = ae == StrIntrinsicNode::LL; 1216 bool isLU = ae == StrIntrinsicNode::LU; 1217 bool isUL = ae == StrIntrinsicNode::UL; 1218 1219 // The stub threshold for LL strings is: 72 (64 + 8) chars 1220 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch) 1221 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least) 1222 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36); 1223 1224 bool str1_isL = isLL || isLU; 1225 bool str2_isL = isLL || isUL; 1226 1227 int str1_chr_shift = str1_isL ? 0 : 1; 1228 int str2_chr_shift = str2_isL ? 0 : 1; 1229 int str1_chr_size = str1_isL ? 1 : 2; 1230 int str2_chr_size = str2_isL ? 1 : 2; 1231 int minCharsInWord = isLL ? wordSize : wordSize/2; 1232 1233 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 1234 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 1235 (chr_insn)&MacroAssembler::ldrh; 1236 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 1237 (chr_insn)&MacroAssembler::ldrh; 1238 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 1239 (uxt_insn)&MacroAssembler::uxthw; 1240 1241 BLOCK_COMMENT("string_compare {"); 1242 1243 // Bizzarely, the counts are passed in bytes, regardless of whether they 1244 // are L or U strings, however the result is always in characters. 1245 if (!str1_isL) asrw(cnt1, cnt1, 1); 1246 if (!str2_isL) asrw(cnt2, cnt2, 1); 1247 1248 // Compute the minimum of the string lengths and save the difference. 
1249 subsw(result, cnt1, cnt2); 1250 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1251 1252 // A very short string 1253 cmpw(cnt2, minCharsInWord); 1254 br(Assembler::LE, SHORT_STRING); 1255 1256 // Compare longwords 1257 // load first parts of strings and finish initialization while loading 1258 { 1259 if (str1_isL == str2_isL) { // LL or UU 1260 ldr(tmp1, Address(str1)); 1261 cmp(str1, str2); 1262 br(Assembler::EQ, DONE); 1263 ldr(tmp2, Address(str2)); 1264 cmp(cnt2, stub_threshold); 1265 br(GE, STUB); 1266 subsw(cnt2, cnt2, minCharsInWord); 1267 br(EQ, TAIL_CHECK); 1268 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1269 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1270 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1271 } else if (isLU) { 1272 ldrs(vtmp, Address(str1)); 1273 ldr(tmp2, Address(str2)); 1274 cmp(cnt2, stub_threshold); 1275 br(GE, STUB); 1276 subw(cnt2, cnt2, 4); 1277 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1278 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1279 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1280 zip1(vtmp, T8B, vtmp, vtmpZ); 1281 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1282 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1283 add(cnt1, cnt1, 4); 1284 fmovd(tmp1, vtmp); 1285 } else { // UL case 1286 ldr(tmp1, Address(str1)); 1287 ldrs(vtmp, Address(str2)); 1288 cmp(cnt2, stub_threshold); 1289 br(GE, STUB); 1290 subw(cnt2, cnt2, 4); 1291 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1292 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1293 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1294 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1295 zip1(vtmp, T8B, vtmp, vtmpZ); 1296 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1297 add(cnt1, cnt1, 8); 1298 fmovd(tmp2, vtmp); 1299 } 1300 adds(cnt2, cnt2, isUL ? 
4 : 8); 1301 br(GE, TAIL); 1302 eor(rscratch2, tmp1, tmp2); 1303 cbnz(rscratch2, DIFF); 1304 // main loop 1305 bind(NEXT_WORD); 1306 if (str1_isL == str2_isL) { 1307 ldr(tmp1, Address(str1, cnt2)); 1308 ldr(tmp2, Address(str2, cnt2)); 1309 adds(cnt2, cnt2, 8); 1310 } else if (isLU) { 1311 ldrs(vtmp, Address(str1, cnt1)); 1312 ldr(tmp2, Address(str2, cnt2)); 1313 add(cnt1, cnt1, 4); 1314 zip1(vtmp, T8B, vtmp, vtmpZ); 1315 fmovd(tmp1, vtmp); 1316 adds(cnt2, cnt2, 8); 1317 } else { // UL 1318 ldrs(vtmp, Address(str2, cnt2)); 1319 ldr(tmp1, Address(str1, cnt1)); 1320 zip1(vtmp, T8B, vtmp, vtmpZ); 1321 add(cnt1, cnt1, 8); 1322 fmovd(tmp2, vtmp); 1323 adds(cnt2, cnt2, 4); 1324 } 1325 br(GE, TAIL); 1326 1327 eor(rscratch2, tmp1, tmp2); 1328 cbz(rscratch2, NEXT_WORD); 1329 b(DIFF); 1330 bind(TAIL); 1331 eor(rscratch2, tmp1, tmp2); 1332 cbnz(rscratch2, DIFF); 1333 // Last longword. In the case where length == 4 we compare the 1334 // same longword twice, but that's still faster than another 1335 // conditional branch. 1336 if (str1_isL == str2_isL) { 1337 ldr(tmp1, Address(str1)); 1338 ldr(tmp2, Address(str2)); 1339 } else if (isLU) { 1340 ldrs(vtmp, Address(str1)); 1341 ldr(tmp2, Address(str2)); 1342 zip1(vtmp, T8B, vtmp, vtmpZ); 1343 fmovd(tmp1, vtmp); 1344 } else { // UL 1345 ldrs(vtmp, Address(str2)); 1346 ldr(tmp1, Address(str1)); 1347 zip1(vtmp, T8B, vtmp, vtmpZ); 1348 fmovd(tmp2, vtmp); 1349 } 1350 bind(TAIL_CHECK); 1351 eor(rscratch2, tmp1, tmp2); 1352 cbz(rscratch2, DONE); 1353 1354 // Find the first different characters in the longwords and 1355 // compute their difference. 1356 bind(DIFF); 1357 rev(rscratch2, rscratch2); 1358 clz(rscratch2, rscratch2); 1359 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1360 lsrv(tmp1, tmp1, rscratch2); 1361 (this->*ext_chr)(tmp1, tmp1); 1362 lsrv(tmp2, tmp2, rscratch2); 1363 (this->*ext_chr)(tmp2, tmp2); 1364 subw(result, tmp1, tmp2); 1365 b(DONE); 1366 } 1367 1368 bind(STUB); 1369 RuntimeAddress stub = nullptr; 1370 switch(ae) { 1371 case StrIntrinsicNode::LL: 1372 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1373 break; 1374 case StrIntrinsicNode::UU: 1375 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1376 break; 1377 case StrIntrinsicNode::LU: 1378 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1379 break; 1380 case StrIntrinsicNode::UL: 1381 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1382 break; 1383 default: 1384 ShouldNotReachHere(); 1385 } 1386 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1387 address call = trampoline_call(stub); 1388 if (call == nullptr) { 1389 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1390 ciEnv::current()->record_failure("CodeCache is full"); 1391 return; 1392 } 1393 b(DONE); 1394 1395 bind(SHORT_STRING); 1396 // Is the minimum length zero? 
1397 cbz(cnt2, DONE); 1398 // arrange code to do most branches while loading and loading next characters 1399 // while comparing previous 1400 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1401 subs(cnt2, cnt2, 1); 1402 br(EQ, SHORT_LAST_INIT); 1403 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1404 b(SHORT_LOOP_START); 1405 bind(SHORT_LOOP); 1406 subs(cnt2, cnt2, 1); 1407 br(EQ, SHORT_LAST); 1408 bind(SHORT_LOOP_START); 1409 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1410 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1411 cmp(tmp1, cnt1); 1412 br(NE, SHORT_LOOP_TAIL); 1413 subs(cnt2, cnt2, 1); 1414 br(EQ, SHORT_LAST2); 1415 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1416 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1417 cmp(tmp2, rscratch1); 1418 br(EQ, SHORT_LOOP); 1419 sub(result, tmp2, rscratch1); 1420 b(DONE); 1421 bind(SHORT_LOOP_TAIL); 1422 sub(result, tmp1, cnt1); 1423 b(DONE); 1424 bind(SHORT_LAST2); 1425 cmp(tmp2, rscratch1); 1426 br(EQ, DONE); 1427 sub(result, tmp2, rscratch1); 1428 1429 b(DONE); 1430 bind(SHORT_LAST_INIT); 1431 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1432 bind(SHORT_LAST); 1433 cmp(tmp1, cnt1); 1434 br(EQ, DONE); 1435 sub(result, tmp1, cnt1); 1436 1437 bind(DONE); 1438 1439 BLOCK_COMMENT("} string_compare"); 1440 } 1441 1442 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1443 FloatRegister src2, Condition cond, bool isQ) { 1444 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1445 FloatRegister zn = src1, zm = src2; 1446 bool needs_negation = false; 1447 switch (cond) { 1448 case LT: cond = GT; zn = src2; zm = src1; break; 1449 case LE: cond = GE; zn = src2; zm = src1; break; 1450 case LO: cond = HI; zn = src2; zm = src1; break; 1451 case LS: cond = HS; zn = src2; zm = src1; break; 1452 case NE: cond = EQ; 
needs_negation = true; break; 1453 default: 1454 break; 1455 } 1456 1457 if (is_floating_point_type(bt)) { 1458 fcm(cond, dst, size, zn, zm); 1459 } else { 1460 cm(cond, dst, size, zn, zm); 1461 } 1462 1463 if (needs_negation) { 1464 notr(dst, isQ ? T16B : T8B, dst); 1465 } 1466 } 1467 1468 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1469 Condition cond, bool isQ) { 1470 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1471 if (bt == T_FLOAT || bt == T_DOUBLE) { 1472 if (cond == Assembler::NE) { 1473 fcm(Assembler::EQ, dst, size, src); 1474 notr(dst, isQ ? T16B : T8B, dst); 1475 } else { 1476 fcm(cond, dst, size, src); 1477 } 1478 } else { 1479 if (cond == Assembler::NE) { 1480 cm(Assembler::EQ, dst, size, src); 1481 notr(dst, isQ ? T16B : T8B, dst); 1482 } else { 1483 cm(cond, dst, size, src); 1484 } 1485 } 1486 } 1487 1488 // Compress the least significant bit of each byte to the rightmost and clear 1489 // the higher garbage bits. 1490 void C2_MacroAssembler::bytemask_compress(Register dst) { 1491 // Example input, dst = 0x01 00 00 00 01 01 00 01 1492 // The "??" bytes are garbage. 1493 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1494 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1495 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1496 andr(dst, dst, 0xff); // dst = 0x8D 1497 } 1498 1499 // Pack the lowest-numbered bit of each mask element in src into a long value 1500 // in dst, at most the first 64 lane elements. 1501 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 
1502 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1503 FloatRegister vtmp1, FloatRegister vtmp2) { 1504 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1505 assert_different_registers(dst, rscratch1); 1506 assert_different_registers(vtmp1, vtmp2); 1507 1508 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1509 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1510 // Expected: dst = 0x658D 1511 1512 // Convert the mask into vector with sequential bytes. 1513 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1514 sve_cpy(vtmp1, size, src, 1, false); 1515 if (bt != T_BYTE) { 1516 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1517 } 1518 1519 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1520 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1521 // is to compress each significant bit of the byte in a cross-lane way. Due 1522 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1523 // (bit-compress in each lane) with the biggest lane size (T = D) then 1524 // concatenate the results. 1525 1526 // The second source input of BEXT, initialized with 0x01 in each byte. 1527 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1528 sve_dup(vtmp2, B, 1); 1529 1530 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1531 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1532 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1533 // --------------------------------------- 1534 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1535 sve_bext(vtmp1, D, vtmp1, vtmp2); 1536 1537 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1538 // result to dst. 1539 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1540 // dst = 0x658D 1541 if (lane_cnt <= 8) { 1542 // No need to concatenate. 
1543 umov(dst, vtmp1, B, 0); 1544 } else if (lane_cnt <= 16) { 1545 ins(vtmp1, B, vtmp1, 1, 8); 1546 umov(dst, vtmp1, H, 0); 1547 } else { 1548 // As the lane count is 64 at most, the final expected value must be in 1549 // the lowest 64 bits after narrowing vtmp1 from D to B. 1550 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2); 1551 umov(dst, vtmp1, D, 0); 1552 } 1553 } else if (UseSVE > 0) { 1554 // Compress the lowest 8 bytes. 1555 fmovd(dst, vtmp1); 1556 bytemask_compress(dst); 1557 if (lane_cnt <= 8) return; 1558 1559 // Repeat on higher bytes and join the results. 1560 // Compress 8 bytes in each iteration. 1561 for (int idx = 1; idx < (lane_cnt / 8); idx++) { 1562 sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2); 1563 bytemask_compress(rscratch1); 1564 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); 1565 } 1566 } else { 1567 assert(false, "unsupported"); 1568 ShouldNotReachHere(); 1569 } 1570 } 1571 1572 // Unpack the mask, a long value in src, into predicate register dst based on the 1573 // corresponding data type. Note that dst can support at most 64 lanes. 1574 // Below example gives the expected dst predicate register in different types, with 1575 // a valid src(0x658D) on a 1024-bit vector size machine. 1576 // BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D 1577 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51 1578 // INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01 1579 // LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01 1580 // 1581 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which 1582 // has 24 significant bits would be an invalid input if dst predicate register refers to 1583 // a LONG type 1024-bit vector, which has at most 16 lanes. 
1584 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt, 1585 FloatRegister vtmp1, FloatRegister vtmp2) { 1586 assert(UseSVE == 2 && VM_Version::supports_svebitperm() && 1587 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported"); 1588 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1589 // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16 1590 // Expected: dst = 0b01101001 10001101 1591 1592 // Put long value from general purpose register into the first lane of vector. 1593 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1594 sve_dup(vtmp1, B, 0); 1595 mov(vtmp1, D, 0, src); 1596 1597 // As sve_cmp generates mask value with the minimum unit in byte, we should 1598 // transform the value in the first lane which is mask in bit now to the 1599 // mask in byte, which can be done by SVE2's BDEP instruction. 1600 1601 // The first source input of BDEP instruction. Deposite each byte in every 8 bytes. 1602 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1603 if (lane_cnt <= 8) { 1604 // Nothing. As only one byte exsits. 1605 } else if (lane_cnt <= 16) { 1606 ins(vtmp1, B, vtmp1, 8, 1); 1607 mov(vtmp1, B, 1, zr); 1608 } else { 1609 sve_vector_extend(vtmp1, D, vtmp1, B); 1610 } 1611 1612 // The second source input of BDEP instruction, initialized with 0x01 for each byte. 1613 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1614 sve_dup(vtmp2, B, 1); 1615 1616 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1617 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1618 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1619 // --------------------------------------- 1620 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1621 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1622 1623 if (bt != T_BYTE) { 1624 sve_vector_extend(vtmp1, size, vtmp1, B); 1625 } 1626 // Generate mask according to the given vector, in which the elements have been 1627 // extended to expected type. 
1628 // dst = 0b01101001 10001101 1629 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1630 } 1631 1632 // Clobbers: rflags 1633 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1634 FloatRegister zn, FloatRegister zm, Condition cond) { 1635 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1636 FloatRegister z1 = zn, z2 = zm; 1637 switch (cond) { 1638 case LE: z1 = zm; z2 = zn; cond = GE; break; 1639 case LT: z1 = zm; z2 = zn; cond = GT; break; 1640 case LO: z1 = zm; z2 = zn; cond = HI; break; 1641 case LS: z1 = zm; z2 = zn; cond = HS; break; 1642 default: 1643 break; 1644 } 1645 1646 SIMD_RegVariant size = elemType_to_regVariant(bt); 1647 if (is_floating_point_type(bt)) { 1648 sve_fcm(cond, pd, size, pg, z1, z2); 1649 } else { 1650 assert(is_integral_type(bt), "unsupported element type"); 1651 sve_cmp(cond, pd, size, pg, z1, z2); 1652 } 1653 } 1654 1655 // Get index of the last mask lane that is set 1656 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1657 SIMD_RegVariant size = elemType_to_regVariant(bt); 1658 sve_rev(ptmp, size, src); 1659 sve_brkb(ptmp, ptrue, ptmp, false); 1660 sve_cntp(dst, size, ptrue, ptmp); 1661 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1662 subw(dst, rscratch1, dst); 1663 } 1664 1665 // Extend integer vector src to dst with the same lane count 1666 // but larger element size, e.g. 
4B -> 4I 1667 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1668 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1669 if (src_bt == T_BYTE) { 1670 if (dst_bt == T_SHORT) { 1671 // 4B/8B to 4S/8S 1672 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1673 } else { 1674 // 4B to 4I 1675 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1676 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1677 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1678 } 1679 } else if (src_bt == T_SHORT) { 1680 // 4S to 4I 1681 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1682 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1683 } else if (src_bt == T_INT) { 1684 // 2I to 2L 1685 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1686 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1687 } else { 1688 ShouldNotReachHere(); 1689 } 1690 } 1691 1692 // Narrow integer vector src down to dst with the same lane count 1693 // but smaller element size, e.g. 
4I -> 4B 1694 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1695 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1696 if (src_bt == T_SHORT) { 1697 // 4S/8S to 4B/8B 1698 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1699 assert(dst_bt == T_BYTE, "unsupported"); 1700 xtn(dst, T8B, src, T8H); 1701 } else if (src_bt == T_INT) { 1702 // 4I to 4B/4S 1703 assert(src_vlen_in_bytes == 16, "unsupported"); 1704 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1705 xtn(dst, T4H, src, T4S); 1706 if (dst_bt == T_BYTE) { 1707 xtn(dst, T8B, dst, T8H); 1708 } 1709 } else if (src_bt == T_LONG) { 1710 // 2L to 2I 1711 assert(src_vlen_in_bytes == 16, "unsupported"); 1712 assert(dst_bt == T_INT, "unsupported"); 1713 xtn(dst, T2S, src, T2D); 1714 } else { 1715 ShouldNotReachHere(); 1716 } 1717 } 1718 1719 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1720 FloatRegister src, SIMD_RegVariant src_size, 1721 bool is_unsigned) { 1722 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1723 1724 if (src_size == B) { 1725 switch (dst_size) { 1726 case H: 1727 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1728 break; 1729 case S: 1730 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1731 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1732 break; 1733 case D: 1734 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1735 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1736 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1737 break; 1738 default: 1739 ShouldNotReachHere(); 1740 } 1741 } else if (src_size == H) { 1742 if (dst_size == S) { 1743 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1744 } else { // D 1745 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1746 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1747 } 1748 } else if 
(src_size == S) { 1749 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1750 } 1751 } 1752 1753 // Vector narrow from src to dst with specified element sizes. 1754 // High part of dst vector will be filled with zero. 1755 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1756 FloatRegister src, SIMD_RegVariant src_size, 1757 FloatRegister tmp) { 1758 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1759 assert_different_registers(src, tmp); 1760 sve_dup(tmp, src_size, 0); 1761 if (src_size == D) { 1762 switch (dst_size) { 1763 case S: 1764 sve_uzp1(dst, S, src, tmp); 1765 break; 1766 case H: 1767 assert_different_registers(dst, tmp); 1768 sve_uzp1(dst, S, src, tmp); 1769 sve_uzp1(dst, H, dst, tmp); 1770 break; 1771 case B: 1772 assert_different_registers(dst, tmp); 1773 sve_uzp1(dst, S, src, tmp); 1774 sve_uzp1(dst, H, dst, tmp); 1775 sve_uzp1(dst, B, dst, tmp); 1776 break; 1777 default: 1778 ShouldNotReachHere(); 1779 } 1780 } else if (src_size == S) { 1781 if (dst_size == H) { 1782 sve_uzp1(dst, H, src, tmp); 1783 } else { // B 1784 assert_different_registers(dst, tmp); 1785 sve_uzp1(dst, H, src, tmp); 1786 sve_uzp1(dst, B, dst, tmp); 1787 } 1788 } else if (src_size == H) { 1789 sve_uzp1(dst, B, src, tmp); 1790 } 1791 } 1792 1793 // Extend src predicate to dst predicate with the same lane count but larger 1794 // element size, e.g. 
// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  // Each sve_punpklo doubles the predicate element size using the low half of
  // the source, so 2x/4x/8x widening takes one/two/three unpack steps.
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument.
  // An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  sve_pfalse(ptmp);
  // Each sve_uzp1 halves the predicate element size; 2x/4x/8x narrowing takes
  // one/two/three steps, interleaving with the all-false ptmp to zero-fill.
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Vector reduction add for integral type with ASIMD instructions.
// dst = isrc + horizontal sum of the lanes of vsrc (result sign-extended to
// int width for sub-int element types, as shown by the sxtb/sxth extends).
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
    switch(bt) {
      case T_BYTE:
        addv(vtmp, isQ ? T16B : T8B, vsrc);
        smov(dst, vtmp, B, 0);
        addw(dst, dst, isrc, ext::sxtb);
        break;
      case T_SHORT:
        addv(vtmp, isQ ? T8H : T4H, vsrc);
        smov(dst, vtmp, H, 0);
        addw(dst, dst, isrc, ext::sxth);
        break;
      case T_INT:
        // addv has no 2S arrangement; use a pairwise add for the 64-bit case.
        isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
        umov(dst, vtmp, S, 0);
        addw(dst, dst, isrc);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        addpd(vtmp, vsrc);
        umov(dst, vtmp, D, 0);
        add(dst, dst, isrc);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}

// Vector reduction multiply for integral type with ASIMD instructions.
// dst = isrc * product of the lanes of vsrc.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Multiply the lower half and higher half of vector iteratively.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          // vtmp1 = vsrc[4:7]
          ins(vtmp1, S, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1]
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          ins(vtmp2, D, vsrc, 0, 1);
          mulv(vtmp2, T4H, vtmp2, vsrc);
          ins(vtmp1, S, vtmp2, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          ins(vtmp1, D, vsrc, 0, 1);
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          // Only two lanes: read them straight from vsrc below.
          vtmp1 = vsrc;
        }
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}

// Vector reduction multiply for floating-point type with ASIMD instructions.
// dst = fsrc * product of the lanes of vsrc, multiplied lane by lane in order
// (floating-point multiply is not associative, so the sequence matters).
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
    switch(bt) {
      case T_FLOAT:
        fmuls(dst, fsrc, vsrc);
        ins(vtmp, S, vsrc, 0, 1);
        fmuls(dst, dst, vtmp);
        if (isQ) {
          ins(vtmp, S, vsrc, 0, 2);
          fmuls(dst, dst, vtmp);
          ins(vtmp, S, vsrc, 0, 3);
          fmuls(dst, dst, vtmp);
        }
        break;
      case T_DOUBLE:
        assert(isQ, "unsupported");
        fmuld(dst, fsrc, vsrc);
        ins(vtmp, D, vsrc, 0, 1);
        fmuld(dst, dst, vtmp);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select logical instruction
// Emits the 32- or 64-bit and/orr/eor matching the reduction opcode.
// NOTE(review): callers pass only three registers, so `kind`/`shift`
// presumably have defaults in the declaration — confirm against the header.
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
    // Move both halves of the vector into GPRs and fold them together, then
    // keep halving the width with shifted operands until one lane remains.
    umov(rscratch1, vsrc, isQ ? D : S, 0);
    umov(dst, vsrc, isQ ? D : S, 1);
    neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
    if (bt == T_LONG) {
      // No cross-lane min/max for 64-bit lanes: compare the two lanes and
      // isrc with scalar cmp/csel.
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, is_min ? LT : GT);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, is_min ? LT : GT);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
      if (size == T2S) {
        // sminv/smaxv have no 2S arrangement; use the pairwise form.
        is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
      } else {
        is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
      }
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, is_min ? LT : GT);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      // Sub-int lanes need sign extension when moved to the GPR.
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // The logical reductions combined sub-int lanes in full register width;
  // truncate the result back to the element type's value range.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      // Patterns VL1..VL8 encode the count directly.
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
  // (SVE COMPACT only supports S/D lanes, so widen to INT, compact each half,
  // narrow back, then splice the two halves together.)
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift(cross lane) compressed high with TRUE_CNT lanes,
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  // (tbl zeroes lanes whose index is out of range, implementing the shift)
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high(after shifted) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src   = 88 77 66 55 44 33 22 11
  //                  mask  = 01 00 00 01 01 00 01 01
  // Expected result: dst   = 00 00 00 88 55 44 22 11
  // (Widen each half to SHORT, compress with sve_compress_short, narrow back,
  //  then splice — same strategy as sve_compress_short one level down.)

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat to the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  // vtmp4 was clobbered by the call above; reset it to all-zero for uzp1.
  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:   dst   = 00 00 00 00 00 44 22 11
  // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift(cross lane) compressed high with TRUE_CNT lanes,
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high(after shifted) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

// Reverse the bits of each lane of the given basic type.
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    // rbit reverses within each byte; reverse the bytes of each lane first so
    // the combination reverses the whole lane.
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

// Reverse the bytes of each lane of the given basic type.
void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      // Byte reversal of a single byte is the identity; just move if needed.
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    // Element lies beyond the NEON-accessible low 128 bits: rotate it down to
    // lane 0 with sve_ext, then extract from there.
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      // 0x1.0p23f: smallest float magnitude at which all values are integral.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // Per lane: pick tmp1 (the floor result) where the flags are clear.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // Skip the fixup entirely when no lane needs it.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

// Emits sign(x) for each float/double lane: +-1.0 with the sign of the input,
// except that +-0.0 and NaN pass through unchanged.
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

// Returns true while C2 is measuring code size in a scratch buffer
// (no compilation task means there is no PhaseOutput to consult).
bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}

void C2_MacroAssembler::load_nklass_compact(Register dst, Register obj, Register index, int scale, int disp) {
  // Note: Don't clobber obj anywhere in that method!

  // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
  // obj-start, so that we can load from the object's mark-word instead. Usually the address
  // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
  // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
  // then passes that register as obj and 0 in disp. The following code extracts the base
  // and offset to load the mark-word.
  int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
  if (index == noreg) {
    ldr(dst, Address(obj, offset));
  } else {
    lea(dst, Address(obj, index, Address::lsl(scale)));
    ldr(dst, Address(dst, offset));
  }
  // The narrow klass id lives in the upper bits of the mark word.
  lsr(dst, dst, markWord::klass_shift);
}