1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 #include "utilities/globalDefinitions.hpp" 36 37 #ifdef PRODUCT 38 #define BLOCK_COMMENT(str) /* nothing */ 39 #define STOP(error) stop(error) 40 #else 41 #define BLOCK_COMMENT(str) block_comment(str) 42 #define STOP(error) block_comment(error); stop(error) 43 #endif 44 45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 46 47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 48 49 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 50 Register tmp2Reg, Register tmp3Reg) { 51 Register oop = objectReg; 52 Register box = boxReg; 53 Register disp_hdr = tmpReg; 54 Register tmp = tmp2Reg; 55 Label cont; 56 Label object_has_monitor; 57 Label count, no_count; 58 59 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 60 assert_different_registers(oop, box, tmp, disp_hdr); 61 62 // Load markWord from object into displaced_header. 63 ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 64 65 if (DiagnoseSyncOnValueBasedClasses != 0) { 66 load_klass(tmp, oop); 67 ldrw(tmp, Address(tmp, Klass::access_flags_offset())); 68 tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS); 69 br(Assembler::NE, cont); 70 } 71 72 // Check for existing monitor 73 tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor); 74 75 if (LockingMode == LM_MONITOR) { 76 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 77 b(cont); 78 } else { 79 assert(LockingMode == LM_LEGACY, "must be"); 80 // Set tmp to be (markWord of object | UNLOCK_VALUE). 81 orr(tmp, disp_hdr, markWord::unlocked_value); 82 83 // Initialize the box. (Must happen before we update the object mark!) 84 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 85 86 // Compare object markWord with an unlocked value (tmp) and if 87 // equal exchange the stack address of our box with object markWord. 88 // On failure disp_hdr contains the possibly locked markWord. 
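    // A rough C-like sketch of this legacy stack-locking attempt (illustration
    // only; the accessors below are shorthand, not real HotSpot calls):
    //
    //   tmp = mark | unlocked_value;                // expected unlocked mark
    //   box->displaced_header = tmp;
    //   if (CAS(&obj->mark, tmp, box)) goto cont;   // flags == EQ: locked
    //   // CAS failed, disp_hdr holds the observed mark. Recursive iff the
    //   // observed mark is a stack address no more than a page above sp and
    //   // its lock bits are clear:
    //   tmp = (mark - sp) & (~(page_size - 1) | lock_mask);
    //   box->displaced_header = tmp;                // 0 in the recursive case
    //   // flags from the 'ands': EQ -> recursive success, NE -> slow path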
89 cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true, 90 /*release*/ true, /*weak*/ false, disp_hdr); 91 br(Assembler::EQ, cont); 92 93 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 94 95 // If the compare-and-exchange succeeded, then we found an unlocked 96 // object, will have now locked it will continue at label cont 97 98 // Check if the owner is self by comparing the value in the 99 // markWord of object (disp_hdr) with the stack pointer. 100 mov(rscratch1, sp); 101 sub(disp_hdr, disp_hdr, rscratch1); 102 mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place)); 103 // If condition is true we are cont and hence we can store 0 as the 104 // displaced header in the box, which indicates that it is a recursive lock. 105 ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result 106 str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes())); 107 b(cont); 108 } 109 110 // Handle existing monitor. 111 bind(object_has_monitor); 112 113 // The object's monitor m is unlocked iff m->owner == nullptr, 114 // otherwise m->owner may contain a thread or a stack address. 115 // 116 // Try to CAS m->owner from null to current thread. 117 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value)); 118 cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true, 119 /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result 120 121 // Store a non-null value into the box to avoid looking like a re-entrant 122 // lock. The fast-path monitor unlock code checks for 123 // markWord::monitor_value so use markWord::unused_mark which has the 124 // relevant bit set, and also matches ObjectSynchronizer::enter. 125 mov(tmp, (address)markWord::unused_mark().value()); 126 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 127 128 br(Assembler::EQ, cont); // CAS success means locking succeeded 129 130 cmp(tmp3Reg, rthread); 131 br(Assembler::NE, cont); // Check for recursive locking 132 133 // Recursive lock case 134 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1); 135 // flag == EQ still from the cmp above, checking if this is a reentrant lock 136 137 bind(cont); 138 // flag == EQ indicates success 139 // flag == NE indicates failure 140 br(Assembler::NE, no_count); 141 142 bind(count); 143 increment(Address(rthread, JavaThread::held_monitor_count_offset())); 144 145 bind(no_count); 146 } 147 148 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg, 149 Register tmp2Reg) { 150 Register oop = objectReg; 151 Register box = boxReg; 152 Register disp_hdr = tmpReg; 153 Register tmp = tmp2Reg; 154 Label cont; 155 Label object_has_monitor; 156 Label count, no_count; 157 158 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 159 assert_different_registers(oop, box, tmp, disp_hdr); 160 161 if (LockingMode == LM_LEGACY) { 162 // Find the lock address and load the displaced header from the stack. 163 ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes())); 164 165 // If the displaced header is 0, we have a recursive unlock. 166 cmp(disp_hdr, zr); 167 br(Assembler::EQ, cont); 168 } 169 170 // Handle existing monitor. 
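  // The tbnz below dispatches on the monitor bit (0b10) of the mark word: if
  // set, the lock is inflated and handled at object_has_monitor. Otherwise,
  // for LM_LEGACY, unlocking is a CAS that swaps the saved displaced header
  // back into the object's mark, expecting the mark to still point at our
  // BasicLock (box).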
171 ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes())); 172 tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor); 173 174 if (LockingMode == LM_MONITOR) { 175 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 176 b(cont); 177 } else { 178 assert(LockingMode == LM_LEGACY, "must be"); 179 // Check if it is still a light weight lock, this is is true if we 180 // see the stack address of the basicLock in the markWord of the 181 // object. 182 183 cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false, 184 /*release*/ true, /*weak*/ false, tmp); 185 b(cont); 186 } 187 188 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 189 190 // Handle existing monitor. 191 bind(object_has_monitor); 192 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 193 add(tmp, tmp, -(int)markWord::monitor_value); // monitor 194 195 ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 196 197 Label notRecursive; 198 cbz(disp_hdr, notRecursive); 199 200 // Recursive lock 201 sub(disp_hdr, disp_hdr, 1u); 202 str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset())); 203 cmp(disp_hdr, disp_hdr); // Sets flags for result 204 b(cont); 205 206 bind(notRecursive); 207 ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset())); 208 ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset())); 209 orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0. 210 cmp(rscratch1, zr); // Sets flags for result 211 cbnz(rscratch1, cont); 212 // need a release store here 213 lea(tmp, Address(tmp, ObjectMonitor::owner_offset())); 214 stlr(zr, tmp); // set unowned 215 216 bind(cont); 217 // flag == EQ indicates success 218 // flag == NE indicates failure 219 br(Assembler::NE, no_count); 220 221 bind(count); 222 decrement(Address(rthread, JavaThread::held_monitor_count_offset())); 223 224 bind(no_count); 225 } 226 227 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1, 228 Register t2, Register t3) { 229 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 230 assert_different_registers(obj, box, t1, t2, t3); 231 232 // Handle inflated monitor. 233 Label inflated; 234 // Finish fast lock successfully. MUST branch to with flag == EQ 235 Label locked; 236 // Finish fast lock unsuccessfully. MUST branch to with flag == NE 237 Label slow_path; 238 239 if (UseObjectMonitorTable) { 240 // Clear cache in case fast locking succeeds. 241 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 242 } 243 244 if (DiagnoseSyncOnValueBasedClasses != 0) { 245 load_klass(t1, obj); 246 ldrw(t1, Address(t1, Klass::access_flags_offset())); 247 tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS); 248 br(Assembler::NE, slow_path); 249 } 250 251 const Register t1_mark = t1; 252 const Register t3_t = t3; 253 254 { // Lightweight locking 255 256 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ 257 Label push; 258 259 const Register t2_top = t2; 260 261 // Check if lock-stack is full. 262 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 263 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1); 264 br(Assembler::GT, slow_path); 265 266 // Check if recursive. 267 subw(t3_t, t2_top, oopSize); 268 ldr(t3_t, Address(rthread, t3_t)); 269 cmp(obj, t3_t); 270 br(Assembler::EQ, push); 271 272 // Relaxed normal load to check for monitor. Optimization for monitor case. 
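    // Low two lock bits of the mark word, as relied on by the checks and
    // transitions below: 0b01 = unlocked, 0b00 = fast-locked (the owner has an
    // entry on its lock-stack), 0b10 = inflated (an ObjectMonitor is attached).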
273 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 274 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 275 276 // Not inflated 277 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 278 279 // Try to lock. Transition lock-bits 0b01 => 0b00 280 orr(t1_mark, t1_mark, markWord::unlocked_value); 281 eor(t3_t, t1_mark, markWord::unlocked_value); 282 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 283 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 284 br(Assembler::NE, slow_path); 285 286 bind(push); 287 // After successful lock, push object on lock-stack. 288 str(obj, Address(rthread, t2_top)); 289 addw(t2_top, t2_top, oopSize); 290 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 291 b(locked); 292 } 293 294 { // Handle inflated monitor. 295 bind(inflated); 296 297 const Register t1_monitor = t1; 298 299 if (!UseObjectMonitorTable) { 300 assert(t1_monitor == t1_mark, "should be the same here"); 301 } else { 302 Label monitor_found; 303 304 // Load cache address 305 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset())); 306 307 const int num_unrolled = 2; 308 for (int i = 0; i < num_unrolled; i++) { 309 ldr(t1, Address(t3_t)); 310 cmp(obj, t1); 311 br(Assembler::EQ, monitor_found); 312 if (i + 1 != num_unrolled) { 313 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 314 } 315 } 316 317 // Loop after unrolling, advance iterator. 318 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 319 320 Label loop; 321 322 // Search for obj in cache. 323 bind(loop); 324 325 // Check for match. 326 ldr(t1, Address(t3_t)); 327 cmp(obj, t1); 328 br(Assembler::EQ, monitor_found); 329 330 // Search until null encountered, guaranteed _null_sentinel at end. 331 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 332 cbnz(t1, loop); 333 // Cache Miss, NE set from cmp above, cbnz does not set flags 334 b(slow_path); 335 336 bind(monitor_found); 337 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference())); 338 } 339 340 const Register t2_owner_addr = t2; 341 const Register t3_owner = t3; 342 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 343 const Address owner_address{t1_monitor, ObjectMonitor::owner_offset() - monitor_tag}; 344 const Address recursions_address{t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag}; 345 346 Label monitor_locked; 347 348 // Compute owner address. 349 lea(t2_owner_addr, owner_address); 350 351 // CAS owner (null => current thread). 352 cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true, 353 /*release*/ false, /*weak*/ false, t3_owner); 354 br(Assembler::EQ, monitor_locked); 355 356 // Check if recursive. 357 cmp(t3_owner, rthread); 358 br(Assembler::NE, slow_path); 359 360 // Recursive. 361 increment(recursions_address, 1); 362 363 bind(monitor_locked); 364 if (UseObjectMonitorTable) { 365 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 366 } 367 } 368 369 bind(locked); 370 increment(Address(rthread, JavaThread::held_monitor_count_offset())); 371 372 #ifdef ASSERT 373 // Check that locked label is reached with Flags == EQ. 374 Label flag_correct; 375 br(Assembler::EQ, flag_correct); 376 stop("Fast Lock Flag != EQ"); 377 #endif 378 379 bind(slow_path); 380 #ifdef ASSERT 381 // Check that slow_path label is reached with Flags == NE. 
382 br(Assembler::NE, flag_correct); 383 stop("Fast Lock Flag != NE"); 384 bind(flag_correct); 385 #endif 386 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 387 } 388 389 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1, 390 Register t2, Register t3) { 391 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 392 assert_different_registers(obj, box, t1, t2, t3); 393 394 // Handle inflated monitor. 395 Label inflated, inflated_load_mark; 396 // Finish fast unlock successfully. MUST branch to with flag == EQ 397 Label unlocked; 398 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 399 Label slow_path; 400 401 const Register t1_mark = t1; 402 const Register t2_top = t2; 403 const Register t3_t = t3; 404 405 { // Lightweight unlock 406 407 Label push_and_slow_path; 408 409 // Check if obj is top of lock-stack. 410 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 411 subw(t2_top, t2_top, oopSize); 412 ldr(t3_t, Address(rthread, t2_top)); 413 cmp(obj, t3_t); 414 // Top of lock stack was not obj. Must be monitor. 415 br(Assembler::NE, inflated_load_mark); 416 417 // Pop lock-stack. 418 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 419 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 420 421 // Check if recursive. 422 subw(t3_t, t2_top, oopSize); 423 ldr(t3_t, Address(rthread, t3_t)); 424 cmp(obj, t3_t); 425 br(Assembler::EQ, unlocked); 426 427 // Not recursive. 428 // Load Mark. 429 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 430 431 // Check header for monitor (0b10). 432 // Because we got here by popping (meaning we pushed in locked) 433 // there will be no monitor in the box. So we need to push back the obj 434 // so that the runtime can fix any potential anonymous owner. 435 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated); 436 437 // Try to unlock. Transition lock bits 0b00 => 0b01 438 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 439 orr(t3_t, t1_mark, markWord::unlocked_value); 440 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 441 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 442 br(Assembler::EQ, unlocked); 443 444 bind(push_and_slow_path); 445 // Compare and exchange failed. 446 // Restore lock-stack and handle the unlock in runtime. 447 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 448 addw(t2_top, t2_top, oopSize); 449 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 450 b(slow_path); 451 } 452 453 454 { // Handle inflated monitor. 455 bind(inflated_load_mark); 456 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 457 #ifdef ASSERT 458 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 459 stop("Fast Unlock not monitor"); 460 #endif 461 462 bind(inflated); 463 464 #ifdef ASSERT 465 Label check_done; 466 subw(t2_top, t2_top, oopSize); 467 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 468 br(Assembler::LT, check_done); 469 ldr(t3_t, Address(rthread, t2_top)); 470 cmp(obj, t3_t); 471 br(Assembler::NE, inflated); 472 stop("Fast Unlock lock on stack"); 473 bind(check_done); 474 #endif 475 476 const Register t1_monitor = t1; 477 478 if (!UseObjectMonitorTable) { 479 assert(t1_monitor == t1_mark, "should be the same here"); 480 481 // Untag the monitor. 
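      // Without the monitor table, the mark of an inflated lock is the
      // ObjectMonitor* with markWord::monitor_value set in its low bits;
      // subtracting the tag recovers the raw monitor pointer.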
482 add(t1_monitor, t1_mark, -(int)markWord::monitor_value); 483 } else { 484 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 485 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 486 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 487 br(Assembler::LO, slow_path); 488 } 489 490 const Register t2_recursions = t2; 491 Label not_recursive; 492 493 // Check if recursive. 494 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 495 cbz(t2_recursions, not_recursive); 496 497 // Recursive unlock. 498 sub(t2_recursions, t2_recursions, 1u); 499 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 500 // Set flag == EQ 501 cmp(t2_recursions, t2_recursions); 502 b(unlocked); 503 504 bind(not_recursive); 505 506 Label release; 507 const Register t2_owner_addr = t2; 508 509 // Compute owner address. 510 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 511 512 // Check if the entry lists are empty. 513 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset())); 514 ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset())); 515 orr(rscratch1, rscratch1, t3_t); 516 cmp(rscratch1, zr); 517 br(Assembler::EQ, release); 518 519 // The owner may be anonymous and we removed the last obj entry in 520 // the lock-stack. This loses the information about the owner. 521 // Write the thread to the owner field so the runtime knows the owner. 522 str(rthread, Address(t2_owner_addr)); 523 b(slow_path); 524 525 bind(release); 526 // Set owner to null. 527 // Release to satisfy the JMM 528 stlr(zr, t2_owner_addr); 529 } 530 531 bind(unlocked); 532 decrement(Address(rthread, JavaThread::held_monitor_count_offset())); 533 534 #ifdef ASSERT 535 // Check that unlocked label is reached with Flags == EQ. 536 Label flag_correct; 537 br(Assembler::EQ, flag_correct); 538 stop("Fast Unlock Flag != EQ"); 539 #endif 540 541 bind(slow_path); 542 #ifdef ASSERT 543 // Check that slow_path label is reached with Flags == NE. 544 br(Assembler::NE, flag_correct); 545 stop("Fast Unlock Flag != NE"); 546 bind(flag_correct); 547 #endif 548 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 549 } 550 551 // Search for str1 in str2 and return index or -1 552 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 553 void C2_MacroAssembler::string_indexof(Register str2, Register str1, 554 Register cnt2, Register cnt1, 555 Register tmp1, Register tmp2, 556 Register tmp3, Register tmp4, 557 Register tmp5, Register tmp6, 558 int icnt1, Register result, int ae) { 559 // NOTE: tmp5, tmp6 can be zr depending on specific method version 560 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 561 562 Register ch1 = rscratch1; 563 Register ch2 = rscratch2; 564 Register cnt1tmp = tmp1; 565 Register cnt2tmp = tmp2; 566 Register cnt1_neg = cnt1; 567 Register cnt2_neg = cnt2; 568 Register result_tmp = tmp4; 569 570 bool isL = ae == StrIntrinsicNode::LL; 571 572 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 573 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 574 int str1_chr_shift = str1_isL ? 0:1; 575 int str2_chr_shift = str2_isL ? 0:1; 576 int str1_chr_size = str1_isL ? 1:2; 577 int str2_chr_size = str2_isL ? 1:2; 578 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 579 (chr_insn)&MacroAssembler::ldrh; 580 chr_insn str2_load_1chr = str2_isL ? 
(chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //       int i, j;
  //       unsigned c;
  //       unsigned char bc[ASIZE];
  //
  //       /* Preprocessing */
  //       for (i = 0; i < ASIZE; ++i)
  //          bc[i] = m;
  //       for (i = 0; i < m - 1; ) {
  //          c = x[i];
  //          ++i;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef PATTERN_STRING_IS_LATIN1
  //          bc[c] = m - i;
  //          #else
  //          if (c < ASIZE) bc[c] = m - i;
  //          #endif
  //       }
  //
  //       /* Searching */
  //       j = 0;
  //       while (j <= n - m) {
  //          c = y[i+j];
  //          if (x[m-1] == c)
  //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //          if (i < 0) return j;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef SOURCE_STRING_IS_LATIN1
  //          // LL case: (c< 256) always true. Remove branch
  //          j += bc[y[j+m-1]];
  //          #endif
  //          #ifndef PATTERN_STRING_IS_UTF
  //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += 1
  //          #endif
  //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
667 // if (c < ASIZE) 668 // j += bc[y[j+m-1]]; 669 // else 670 // j += m 671 // #endif 672 // } 673 // } 674 675 if (icnt1 == -1) { 676 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 677 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 678 Register cnt1end = tmp2; 679 Register str2end = cnt2; 680 Register skipch = tmp2; 681 682 // str1 length is >=8, so, we can read at least 1 register for cases when 683 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 684 // UL case. We'll re-read last character in inner pre-loop code to have 685 // single outer pre-loop load 686 const int firstStep = isL ? 7 : 3; 687 688 const int ASIZE = 256; 689 const int STORED_BYTES = 32; // amount of bytes stored per instruction 690 sub(sp, sp, ASIZE); 691 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 692 mov(ch1, sp); 693 BIND(BM_INIT_LOOP); 694 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 695 subs(tmp5, tmp5, 1); 696 br(GT, BM_INIT_LOOP); 697 698 sub(cnt1tmp, cnt1, 1); 699 mov(tmp5, str2); 700 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 701 sub(ch2, cnt1, 1); 702 mov(tmp3, str1); 703 BIND(BCLOOP); 704 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 705 if (!str1_isL) { 706 subs(zr, ch1, ASIZE); 707 br(HS, BCSKIP); 708 } 709 strb(ch2, Address(sp, ch1)); 710 BIND(BCSKIP); 711 subs(ch2, ch2, 1); 712 br(GT, BCLOOP); 713 714 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 715 if (str1_isL == str2_isL) { 716 // load last 8 bytes (8LL/4UU symbols) 717 ldr(tmp6, Address(tmp6, -wordSize)); 718 } else { 719 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 720 // convert Latin1 to UTF. We'll have to wait until load completed, but 721 // it's still faster than per-character loads+checks 722 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 723 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 724 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 725 andr(tmp6, tmp6, 0xFF); // str1[N-4] 726 orr(ch2, ch1, ch2, LSL, 16); 727 orr(tmp6, tmp6, tmp3, LSL, 48); 728 orr(tmp6, tmp6, ch2, LSL, 16); 729 } 730 BIND(BMLOOPSTR2); 731 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 732 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 733 if (str1_isL == str2_isL) { 734 // re-init tmp3. It's for free because it's executed in parallel with 735 // load above. Alternative is to initialize it before loop, but it'll 736 // affect performance on in-order systems with 2 or more ld/st pipelines 737 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 738 } 739 if (!isL) { // UU/UL case 740 lsl(ch2, cnt1tmp, 1); // offset in bytes 741 } 742 cmp(tmp3, skipch); 743 br(NE, BMSKIP); 744 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 745 mov(ch1, tmp6); 746 if (isL) { 747 b(BMLOOPSTR1_AFTER_LOAD); 748 } else { 749 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 750 b(BMLOOPSTR1_CMP); 751 } 752 BIND(BMLOOPSTR1); 753 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 754 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 755 BIND(BMLOOPSTR1_AFTER_LOAD); 756 subs(cnt1tmp, cnt1tmp, 1); 757 br(LT, BMLOOPSTR1_LASTCMP); 758 BIND(BMLOOPSTR1_CMP); 759 cmp(ch1, ch2); 760 br(EQ, BMLOOPSTR1); 761 BIND(BMSKIP); 762 if (!isL) { 763 // if we've met UTF symbol while searching Latin1 pattern, then we can 764 // skip cnt1 symbols 765 if (str1_isL != str2_isL) { 766 mov(result_tmp, cnt1); 767 } else { 768 mov(result_tmp, 1); 769 } 770 subs(zr, skipch, ASIZE); 771 br(HS, BMADV); 772 } 773 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 774 BIND(BMADV); 775 sub(cnt1tmp, cnt1, 1); 776 add(str2, str2, result_tmp, LSL, str2_chr_shift); 777 cmp(str2, str2end); 778 br(LE, BMLOOPSTR2); 779 add(sp, sp, ASIZE); 780 b(NOMATCH); 781 BIND(BMLOOPSTR1_LASTCMP); 782 cmp(ch1, ch2); 783 br(NE, BMSKIP); 784 BIND(BMMATCH); 785 sub(result, str2, tmp5); 786 if (!str2_isL) lsr(result, result, 1); 787 add(sp, sp, ASIZE); 788 b(DONE); 789 790 BIND(LINEARSTUB); 791 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 792 br(LT, LINEAR_MEDIUM); 793 mov(result, zr); 794 RuntimeAddress stub = nullptr; 795 if (isL) { 796 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 797 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 798 } else if (str1_isL) { 799 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 800 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 801 } else { 802 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 803 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 804 } 805 address call = trampoline_call(stub); 806 if (call == nullptr) { 807 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 808 ciEnv::current()->record_failure("CodeCache is full"); 809 return; 810 } 811 b(DONE); 812 } 813 814 BIND(LINEARSEARCH); 815 { 816 Label DO1, DO2, DO3; 817 818 Register str2tmp = tmp2; 819 Register first = tmp3; 820 821 if (icnt1 == -1) 822 { 823 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 824 825 cmp(cnt1, u1(str1_isL == str2_isL ? 
4 : 2)); 826 br(LT, DOSHORT); 827 BIND(LINEAR_MEDIUM); 828 (this->*str1_load_1chr)(first, Address(str1)); 829 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 830 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 831 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 832 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 833 834 BIND(FIRST_LOOP); 835 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 836 cmp(first, ch2); 837 br(EQ, STR1_LOOP); 838 BIND(STR2_NEXT); 839 adds(cnt2_neg, cnt2_neg, str2_chr_size); 840 br(LE, FIRST_LOOP); 841 b(NOMATCH); 842 843 BIND(STR1_LOOP); 844 adds(cnt1tmp, cnt1_neg, str1_chr_size); 845 add(cnt2tmp, cnt2_neg, str2_chr_size); 846 br(GE, MATCH); 847 848 BIND(STR1_NEXT); 849 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 850 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 851 cmp(ch1, ch2); 852 br(NE, STR2_NEXT); 853 adds(cnt1tmp, cnt1tmp, str1_chr_size); 854 add(cnt2tmp, cnt2tmp, str2_chr_size); 855 br(LT, STR1_NEXT); 856 b(MATCH); 857 858 BIND(DOSHORT); 859 if (str1_isL == str2_isL) { 860 cmp(cnt1, (u1)2); 861 br(LT, DO1); 862 br(GT, DO3); 863 } 864 } 865 866 if (icnt1 == 4) { 867 Label CH1_LOOP; 868 869 (this->*load_4chr)(ch1, str1); 870 sub(result_tmp, cnt2, 4); 871 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 872 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 873 874 BIND(CH1_LOOP); 875 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 876 cmp(ch1, ch2); 877 br(EQ, MATCH); 878 adds(cnt2_neg, cnt2_neg, str2_chr_size); 879 br(LE, CH1_LOOP); 880 b(NOMATCH); 881 } 882 883 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 884 Label CH1_LOOP; 885 886 BIND(DO2); 887 (this->*load_2chr)(ch1, str1); 888 if (icnt1 == 2) { 889 sub(result_tmp, cnt2, 2); 890 } 891 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 892 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 893 BIND(CH1_LOOP); 894 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 895 cmp(ch1, ch2); 896 br(EQ, MATCH); 897 adds(cnt2_neg, cnt2_neg, str2_chr_size); 898 br(LE, CH1_LOOP); 899 b(NOMATCH); 900 } 901 902 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 903 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 904 905 BIND(DO3); 906 (this->*load_2chr)(first, str1); 907 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 908 if (icnt1 == 3) { 909 sub(result_tmp, cnt2, 3); 910 } 911 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 912 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 913 BIND(FIRST_LOOP); 914 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 915 cmpw(first, ch2); 916 br(EQ, STR1_LOOP); 917 BIND(STR2_NEXT); 918 adds(cnt2_neg, cnt2_neg, str2_chr_size); 919 br(LE, FIRST_LOOP); 920 b(NOMATCH); 921 922 BIND(STR1_LOOP); 923 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 924 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 925 cmp(ch1, ch2); 926 br(NE, STR2_NEXT); 927 b(MATCH); 928 } 929 930 if (icnt1 == -1 || icnt1 == 1) { 931 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 932 933 BIND(DO1); 934 (this->*str1_load_1chr)(ch1, str1); 935 cmp(cnt2, (u1)8); 936 br(LT, DO1_SHORT); 937 938 sub(result_tmp, cnt2, 8/str2_chr_size); 939 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 940 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 941 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 942 943 if (str2_isL) { 944 orr(ch1, ch1, ch1, LSL, 8); 945 } 946 orr(ch1, ch1, ch1, LSL, 16); 947 orr(ch1, ch1, ch1, LSL, 32); 948 BIND(CH1_LOOP); 949 ldr(ch2, Address(str2, cnt2_neg)); 950 eor(ch2, ch1, ch2); 951 sub(tmp1, ch2, tmp3); 952 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 953 bics(tmp1, tmp1, tmp2); 954 br(NE, HAS_ZERO); 955 adds(cnt2_neg, cnt2_neg, 8); 956 br(LT, CH1_LOOP); 957 958 cmp(cnt2_neg, (u1)8); 959 mov(cnt2_neg, 0); 960 br(LT, CH1_LOOP); 961 b(NOMATCH); 962 963 BIND(HAS_ZERO); 964 rev(tmp1, tmp1); 965 clz(tmp1, tmp1); 966 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 967 b(MATCH); 968 969 BIND(DO1_SHORT); 970 mov(result_tmp, cnt2); 971 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 972 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 973 BIND(DO1_LOOP); 974 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 975 cmpw(ch1, ch2); 976 br(EQ, MATCH); 977 adds(cnt2_neg, cnt2_neg, str2_chr_size); 978 br(LT, DO1_LOOP); 979 } 980 } 981 BIND(NOMATCH); 982 mov(result, -1); 983 b(DONE); 984 BIND(MATCH); 985 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 986 BIND(DONE); 987 } 988 989 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 990 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 991 992 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 993 Register ch, Register result, 994 Register tmp1, Register tmp2, Register tmp3) 995 { 996 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 997 Register cnt1_neg = cnt1; 998 Register ch1 = rscratch1; 999 Register result_tmp = rscratch2; 1000 1001 cbz(cnt1, NOMATCH); 1002 1003 cmp(cnt1, (u1)4); 1004 br(LT, DO1_SHORT); 1005 1006 orr(ch, ch, ch, LSL, 16); 1007 orr(ch, ch, ch, LSL, 32); 1008 1009 sub(cnt1, cnt1, 4); 1010 mov(result_tmp, cnt1); 1011 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1012 sub(cnt1_neg, zr, cnt1, LSL, 1); 1013 1014 mov(tmp3, 0x0001000100010001); 1015 1016 BIND(CH1_LOOP); 1017 ldr(ch1, Address(str1, cnt1_neg)); 1018 eor(ch1, ch, ch1); 1019 sub(tmp1, ch1, tmp3); 1020 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 1021 bics(tmp1, tmp1, tmp2); 1022 br(NE, HAS_ZERO); 1023 adds(cnt1_neg, cnt1_neg, 8); 1024 br(LT, CH1_LOOP); 1025 1026 cmp(cnt1_neg, (u1)8); 1027 mov(cnt1_neg, 0); 1028 br(LT, CH1_LOOP); 1029 b(NOMATCH); 1030 1031 BIND(HAS_ZERO); 1032 rev(tmp1, tmp1); 1033 clz(tmp1, tmp1); 1034 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1035 b(MATCH); 1036 1037 BIND(DO1_SHORT); 1038 mov(result_tmp, cnt1); 1039 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1040 sub(cnt1_neg, zr, cnt1, LSL, 1); 1041 BIND(DO1_LOOP); 1042 ldrh(ch1, Address(str1, cnt1_neg)); 1043 cmpw(ch, ch1); 1044 br(EQ, MATCH); 1045 adds(cnt1_neg, cnt1_neg, 2); 1046 br(LT, DO1_LOOP); 1047 BIND(NOMATCH); 1048 mov(result, -1); 1049 b(DONE); 1050 BIND(MATCH); 1051 add(result, result_tmp, cnt1_neg, ASR, 1); 1052 BIND(DONE); 1053 } 1054 1055 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 1056 Register ch, Register result, 1057 FloatRegister ztmp1, 1058 FloatRegister ztmp2, 1059 PRegister tmp_pg, 1060 PRegister tmp_pdn, bool isL) 1061 { 1062 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 
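  // tmp_pg is the governing predicate: it controls which elements the
  // predicated loads and the compare below operate on. tmp_pdn only receives
  // the comparison result (and is later refined by brka), so it does not have
  // to be a governing register.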
1063 assert(tmp_pg->is_governing(), 1064 "this register has to be a governing predicate register"); 1065 1066 Label LOOP, MATCH, DONE, NOMATCH; 1067 Register vec_len = rscratch1; 1068 Register idx = rscratch2; 1069 1070 SIMD_RegVariant T = (isL == true) ? B : H; 1071 1072 cbz(cnt1, NOMATCH); 1073 1074 // Assign the particular char throughout the vector. 1075 sve_dup(ztmp2, T, ch); 1076 if (isL) { 1077 sve_cntb(vec_len); 1078 } else { 1079 sve_cnth(vec_len); 1080 } 1081 mov(idx, 0); 1082 1083 // Generate a predicate to control the reading of input string. 1084 sve_whilelt(tmp_pg, T, idx, cnt1); 1085 1086 BIND(LOOP); 1087 // Read a vector of 8- or 16-bit data depending on the string type. Note 1088 // that inactive elements indicated by the predicate register won't cause 1089 // a data read from memory to the destination vector. 1090 if (isL) { 1091 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1092 } else { 1093 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1094 } 1095 add(idx, idx, vec_len); 1096 1097 // Perform the comparison. An element of the destination predicate is set 1098 // to active if the particular char is matched. 1099 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1100 1101 // Branch if the particular char is found. 1102 br(NE, MATCH); 1103 1104 sve_whilelt(tmp_pg, T, idx, cnt1); 1105 1106 // Loop back if the particular char not found. 1107 br(MI, LOOP); 1108 1109 BIND(NOMATCH); 1110 mov(result, -1); 1111 b(DONE); 1112 1113 BIND(MATCH); 1114 // Undo the index increment. 1115 sub(idx, idx, vec_len); 1116 1117 // Crop the vector to find its location. 1118 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1119 add(result, idx, -1); 1120 sve_incp(result, T, tmp_pdn); 1121 BIND(DONE); 1122 } 1123 1124 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1125 Register ch, Register result, 1126 Register tmp1, Register tmp2, Register tmp3) 1127 { 1128 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1129 Register cnt1_neg = cnt1; 1130 Register ch1 = rscratch1; 1131 Register result_tmp = rscratch2; 1132 1133 cbz(cnt1, NOMATCH); 1134 1135 cmp(cnt1, (u1)8); 1136 br(LT, DO1_SHORT); 1137 1138 orr(ch, ch, ch, LSL, 8); 1139 orr(ch, ch, ch, LSL, 16); 1140 orr(ch, ch, ch, LSL, 32); 1141 1142 sub(cnt1, cnt1, 8); 1143 mov(result_tmp, cnt1); 1144 lea(str1, Address(str1, cnt1)); 1145 sub(cnt1_neg, zr, cnt1); 1146 1147 mov(tmp3, 0x0101010101010101); 1148 1149 BIND(CH1_LOOP); 1150 ldr(ch1, Address(str1, cnt1_neg)); 1151 eor(ch1, ch, ch1); 1152 sub(tmp1, ch1, tmp3); 1153 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1154 bics(tmp1, tmp1, tmp2); 1155 br(NE, HAS_ZERO); 1156 adds(cnt1_neg, cnt1_neg, 8); 1157 br(LT, CH1_LOOP); 1158 1159 cmp(cnt1_neg, (u1)8); 1160 mov(cnt1_neg, 0); 1161 br(LT, CH1_LOOP); 1162 b(NOMATCH); 1163 1164 BIND(HAS_ZERO); 1165 rev(tmp1, tmp1); 1166 clz(tmp1, tmp1); 1167 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1168 b(MATCH); 1169 1170 BIND(DO1_SHORT); 1171 mov(result_tmp, cnt1); 1172 lea(str1, Address(str1, cnt1)); 1173 sub(cnt1_neg, zr, cnt1); 1174 BIND(DO1_LOOP); 1175 ldrb(ch1, Address(str1, cnt1_neg)); 1176 cmp(ch, ch1); 1177 br(EQ, MATCH); 1178 adds(cnt1_neg, cnt1_neg, 1); 1179 br(LT, DO1_LOOP); 1180 BIND(NOMATCH); 1181 mov(result, -1); 1182 b(DONE); 1183 BIND(MATCH); 1184 add(result, result_tmp, cnt1_neg); 1185 BIND(DONE); 1186 } 1187 1188 // Compare strings. 
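// 'ae' encodes the operand encodings (StrIntrinsicNode::LL, UU, LU or UL,
// where L is Latin-1 and U is UTF-16). In outline, the generated code computes
// what this C-like sketch describes (an illustration of the intended result,
// not the emitted instruction sequence):
//
//   int min = len1 < len2 ? len1 : len2;
//   for (int i = 0; i < min; i++) {
//     if (s1[i] != s2[i]) return s1[i] - s2[i];  // first difference
//   }
//   return len1 - len2;                          // one string is a prefix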
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
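  // result starts out as cnt1 - cnt2, the value returned when the shorter
  // string is a prefix of the longer one; at DIFF it is overwritten with the
  // difference of the first mismatching characters. cnt2 becomes
  // min(cnt1, cnt2) via the cselw below.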
1231 subsw(result, cnt1, cnt2); 1232 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1233 1234 // A very short string 1235 cmpw(cnt2, minCharsInWord); 1236 br(Assembler::LE, SHORT_STRING); 1237 1238 // Compare longwords 1239 // load first parts of strings and finish initialization while loading 1240 { 1241 if (str1_isL == str2_isL) { // LL or UU 1242 ldr(tmp1, Address(str1)); 1243 cmp(str1, str2); 1244 br(Assembler::EQ, DONE); 1245 ldr(tmp2, Address(str2)); 1246 cmp(cnt2, stub_threshold); 1247 br(GE, STUB); 1248 subsw(cnt2, cnt2, minCharsInWord); 1249 br(EQ, TAIL_CHECK); 1250 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1251 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1252 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1253 } else if (isLU) { 1254 ldrs(vtmp, Address(str1)); 1255 ldr(tmp2, Address(str2)); 1256 cmp(cnt2, stub_threshold); 1257 br(GE, STUB); 1258 subw(cnt2, cnt2, 4); 1259 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1260 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1261 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1262 zip1(vtmp, T8B, vtmp, vtmpZ); 1263 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1264 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1265 add(cnt1, cnt1, 4); 1266 fmovd(tmp1, vtmp); 1267 } else { // UL case 1268 ldr(tmp1, Address(str1)); 1269 ldrs(vtmp, Address(str2)); 1270 cmp(cnt2, stub_threshold); 1271 br(GE, STUB); 1272 subw(cnt2, cnt2, 4); 1273 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1274 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1275 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1276 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1277 zip1(vtmp, T8B, vtmp, vtmpZ); 1278 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1279 add(cnt1, cnt1, 8); 1280 fmovd(tmp2, vtmp); 1281 } 1282 adds(cnt2, cnt2, isUL ? 4 : 8); 1283 br(GE, TAIL); 1284 eor(rscratch2, tmp1, tmp2); 1285 cbnz(rscratch2, DIFF); 1286 // main loop 1287 bind(NEXT_WORD); 1288 if (str1_isL == str2_isL) { 1289 ldr(tmp1, Address(str1, cnt2)); 1290 ldr(tmp2, Address(str2, cnt2)); 1291 adds(cnt2, cnt2, 8); 1292 } else if (isLU) { 1293 ldrs(vtmp, Address(str1, cnt1)); 1294 ldr(tmp2, Address(str2, cnt2)); 1295 add(cnt1, cnt1, 4); 1296 zip1(vtmp, T8B, vtmp, vtmpZ); 1297 fmovd(tmp1, vtmp); 1298 adds(cnt2, cnt2, 8); 1299 } else { // UL 1300 ldrs(vtmp, Address(str2, cnt2)); 1301 ldr(tmp1, Address(str1, cnt1)); 1302 zip1(vtmp, T8B, vtmp, vtmpZ); 1303 add(cnt1, cnt1, 8); 1304 fmovd(tmp2, vtmp); 1305 adds(cnt2, cnt2, 4); 1306 } 1307 br(GE, TAIL); 1308 1309 eor(rscratch2, tmp1, tmp2); 1310 cbz(rscratch2, NEXT_WORD); 1311 b(DIFF); 1312 bind(TAIL); 1313 eor(rscratch2, tmp1, tmp2); 1314 cbnz(rscratch2, DIFF); 1315 // Last longword. In the case where length == 4 we compare the 1316 // same longword twice, but that's still faster than another 1317 // conditional branch. 1318 if (str1_isL == str2_isL) { 1319 ldr(tmp1, Address(str1)); 1320 ldr(tmp2, Address(str2)); 1321 } else if (isLU) { 1322 ldrs(vtmp, Address(str1)); 1323 ldr(tmp2, Address(str2)); 1324 zip1(vtmp, T8B, vtmp, vtmpZ); 1325 fmovd(tmp1, vtmp); 1326 } else { // UL 1327 ldrs(vtmp, Address(str2)); 1328 ldr(tmp1, Address(str1)); 1329 zip1(vtmp, T8B, vtmp, vtmpZ); 1330 fmovd(tmp2, vtmp); 1331 } 1332 bind(TAIL_CHECK); 1333 eor(rscratch2, tmp1, tmp2); 1334 cbz(rscratch2, DONE); 1335 1336 // Find the first different characters in the longwords and 1337 // compute their difference. 1338 bind(DIFF); 1339 rev(rscratch2, rscratch2); 1340 clz(rscratch2, rscratch2); 1341 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1342 lsrv(tmp1, tmp1, rscratch2); 1343 (this->*ext_chr)(tmp1, tmp1); 1344 lsrv(tmp2, tmp2, rscratch2); 1345 (this->*ext_chr)(tmp2, tmp2); 1346 subw(result, tmp1, tmp2); 1347 b(DONE); 1348 } 1349 1350 bind(STUB); 1351 RuntimeAddress stub = nullptr; 1352 switch(ae) { 1353 case StrIntrinsicNode::LL: 1354 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1355 break; 1356 case StrIntrinsicNode::UU: 1357 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1358 break; 1359 case StrIntrinsicNode::LU: 1360 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1361 break; 1362 case StrIntrinsicNode::UL: 1363 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1364 break; 1365 default: 1366 ShouldNotReachHere(); 1367 } 1368 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1369 address call = trampoline_call(stub); 1370 if (call == nullptr) { 1371 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1372 ciEnv::current()->record_failure("CodeCache is full"); 1373 return; 1374 } 1375 b(DONE); 1376 1377 bind(SHORT_STRING); 1378 // Is the minimum length zero? 1379 cbz(cnt2, DONE); 1380 // arrange code to do most branches while loading and loading next characters 1381 // while comparing previous 1382 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1383 subs(cnt2, cnt2, 1); 1384 br(EQ, SHORT_LAST_INIT); 1385 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1386 b(SHORT_LOOP_START); 1387 bind(SHORT_LOOP); 1388 subs(cnt2, cnt2, 1); 1389 br(EQ, SHORT_LAST); 1390 bind(SHORT_LOOP_START); 1391 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1392 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1393 cmp(tmp1, cnt1); 1394 br(NE, SHORT_LOOP_TAIL); 1395 subs(cnt2, cnt2, 1); 1396 br(EQ, SHORT_LAST2); 1397 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1398 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1399 cmp(tmp2, rscratch1); 1400 br(EQ, SHORT_LOOP); 1401 sub(result, tmp2, rscratch1); 1402 b(DONE); 1403 bind(SHORT_LOOP_TAIL); 1404 sub(result, tmp1, cnt1); 1405 b(DONE); 1406 bind(SHORT_LAST2); 1407 cmp(tmp2, rscratch1); 1408 br(EQ, DONE); 1409 sub(result, tmp2, rscratch1); 1410 1411 b(DONE); 1412 bind(SHORT_LAST_INIT); 1413 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1414 bind(SHORT_LAST); 1415 cmp(tmp1, cnt1); 1416 br(EQ, DONE); 1417 sub(result, tmp1, cnt1); 1418 1419 bind(DONE); 1420 1421 BLOCK_COMMENT("} string_compare"); 1422 } 1423 1424 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1425 FloatRegister src2, Condition cond, bool isQ) { 1426 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1427 FloatRegister zn = src1, zm = src2; 1428 bool needs_negation = false; 1429 switch (cond) { 1430 case LT: cond = GT; zn = src2; zm = src1; break; 1431 case LE: cond = GE; zn = src2; zm = src1; break; 1432 case LO: cond = HI; zn = src2; zm = src1; break; 1433 case LS: cond = HS; zn = src2; zm = src1; break; 1434 case NE: cond = EQ; needs_negation = true; break; 1435 default: 1436 break; 1437 } 1438 1439 if (is_floating_point_type(bt)) { 1440 fcm(cond, dst, size, zn, zm); 1441 } else { 1442 cm(cond, dst, size, zn, zm); 1443 } 1444 1445 if (needs_negation) { 1446 notr(dst, isQ ? 
T16B : T8B, dst); 1447 } 1448 } 1449 1450 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1451 Condition cond, bool isQ) { 1452 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1453 if (bt == T_FLOAT || bt == T_DOUBLE) { 1454 if (cond == Assembler::NE) { 1455 fcm(Assembler::EQ, dst, size, src); 1456 notr(dst, isQ ? T16B : T8B, dst); 1457 } else { 1458 fcm(cond, dst, size, src); 1459 } 1460 } else { 1461 if (cond == Assembler::NE) { 1462 cm(Assembler::EQ, dst, size, src); 1463 notr(dst, isQ ? T16B : T8B, dst); 1464 } else { 1465 cm(cond, dst, size, src); 1466 } 1467 } 1468 } 1469 1470 // Compress the least significant bit of each byte to the rightmost and clear 1471 // the higher garbage bits. 1472 void C2_MacroAssembler::bytemask_compress(Register dst) { 1473 // Example input, dst = 0x01 00 00 00 01 01 00 01 1474 // The "??" bytes are garbage. 1475 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1476 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1477 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1478 andr(dst, dst, 0xff); // dst = 0x8D 1479 } 1480 1481 // Pack the lowest-numbered bit of each mask element in src into a long value 1482 // in dst, at most the first 64 lane elements. 1483 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1484 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1485 FloatRegister vtmp1, FloatRegister vtmp2) { 1486 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1487 assert_different_registers(dst, rscratch1); 1488 assert_different_registers(vtmp1, vtmp2); 1489 1490 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1491 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1492 // Expected: dst = 0x658D 1493 1494 // Convert the mask into vector with sequential bytes. 1495 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1496 sve_cpy(vtmp1, size, src, 1, false); 1497 if (bt != T_BYTE) { 1498 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1499 } 1500 1501 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1502 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1503 // is to compress each significant bit of the byte in a cross-lane way. Due 1504 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1505 // (bit-compress in each lane) with the biggest lane size (T = D) then 1506 // concatenate the results. 1507 1508 // The second source input of BEXT, initialized with 0x01 in each byte. 1509 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1510 sve_dup(vtmp2, B, 1); 1511 1512 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1513 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1514 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1515 // --------------------------------------- 1516 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1517 sve_bext(vtmp1, D, vtmp1, vtmp2); 1518 1519 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1520 // result to dst. 1521 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1522 // dst = 0x658D 1523 if (lane_cnt <= 8) { 1524 // No need to concatenate. 
umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates mask value with the minimum unit in byte, we should
  // transform the value in the first lane which is mask in bit now to the
  // mask in byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1595 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1596 sve_dup(vtmp2, B, 1); 1597 1598 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1599 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1600 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1601 // --------------------------------------- 1602 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1603 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1604 1605 if (bt != T_BYTE) { 1606 sve_vector_extend(vtmp1, size, vtmp1, B); 1607 } 1608 // Generate mask according to the given vector, in which the elements have been 1609 // extended to expected type. 1610 // dst = 0b01101001 10001101 1611 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1612 } 1613 1614 // Clobbers: rflags 1615 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1616 FloatRegister zn, FloatRegister zm, Condition cond) { 1617 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1618 FloatRegister z1 = zn, z2 = zm; 1619 switch (cond) { 1620 case LE: z1 = zm; z2 = zn; cond = GE; break; 1621 case LT: z1 = zm; z2 = zn; cond = GT; break; 1622 case LO: z1 = zm; z2 = zn; cond = HI; break; 1623 case LS: z1 = zm; z2 = zn; cond = HS; break; 1624 default: 1625 break; 1626 } 1627 1628 SIMD_RegVariant size = elemType_to_regVariant(bt); 1629 if (is_floating_point_type(bt)) { 1630 sve_fcm(cond, pd, size, pg, z1, z2); 1631 } else { 1632 assert(is_integral_type(bt), "unsupported element type"); 1633 sve_cmp(cond, pd, size, pg, z1, z2); 1634 } 1635 } 1636 1637 // Get index of the last mask lane that is set 1638 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1639 SIMD_RegVariant size = elemType_to_regVariant(bt); 1640 sve_rev(ptmp, size, src); 1641 sve_brkb(ptmp, ptrue, ptmp, false); 1642 sve_cntp(dst, size, ptrue, ptmp); 1643 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1644 subw(dst, rscratch1, dst); 1645 } 1646 1647 // Extend integer vector src to dst with the same lane count 1648 // but larger element size, e.g. 4B -> 4I 1649 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1650 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1651 if (src_bt == T_BYTE) { 1652 if (dst_bt == T_SHORT) { 1653 // 4B/8B to 4S/8S 1654 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1655 } else { 1656 // 4B to 4I 1657 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1658 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1659 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1660 } 1661 } else if (src_bt == T_SHORT) { 1662 // 4S to 4I 1663 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1664 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1665 } else if (src_bt == T_INT) { 1666 // 2I to 2L 1667 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1668 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1669 } else { 1670 ShouldNotReachHere(); 1671 } 1672 } 1673 1674 // Narrow integer vector src down to dst with the same lane count 1675 // but smaller element size, e.g. 
4I -> 4B 1676 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1677 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1678 if (src_bt == T_SHORT) { 1679 // 4S/8S to 4B/8B 1680 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1681 assert(dst_bt == T_BYTE, "unsupported"); 1682 xtn(dst, T8B, src, T8H); 1683 } else if (src_bt == T_INT) { 1684 // 4I to 4B/4S 1685 assert(src_vlen_in_bytes == 16, "unsupported"); 1686 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1687 xtn(dst, T4H, src, T4S); 1688 if (dst_bt == T_BYTE) { 1689 xtn(dst, T8B, dst, T8H); 1690 } 1691 } else if (src_bt == T_LONG) { 1692 // 2L to 2I 1693 assert(src_vlen_in_bytes == 16, "unsupported"); 1694 assert(dst_bt == T_INT, "unsupported"); 1695 xtn(dst, T2S, src, T2D); 1696 } else { 1697 ShouldNotReachHere(); 1698 } 1699 } 1700 1701 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1702 FloatRegister src, SIMD_RegVariant src_size, 1703 bool is_unsigned) { 1704 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1705 1706 if (src_size == B) { 1707 switch (dst_size) { 1708 case H: 1709 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1710 break; 1711 case S: 1712 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1713 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1714 break; 1715 case D: 1716 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1717 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1718 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1719 break; 1720 default: 1721 ShouldNotReachHere(); 1722 } 1723 } else if (src_size == H) { 1724 if (dst_size == S) { 1725 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1726 } else { // D 1727 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1728 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1729 } 1730 } else if (src_size == S) { 1731 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1732 } 1733 } 1734 1735 // Vector narrow from src to dst with specified element sizes. 1736 // High part of dst vector will be filled with zero. 1737 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1738 FloatRegister src, SIMD_RegVariant src_size, 1739 FloatRegister tmp) { 1740 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1741 assert_different_registers(src, tmp); 1742 sve_dup(tmp, src_size, 0); 1743 if (src_size == D) { 1744 switch (dst_size) { 1745 case S: 1746 sve_uzp1(dst, S, src, tmp); 1747 break; 1748 case H: 1749 assert_different_registers(dst, tmp); 1750 sve_uzp1(dst, S, src, tmp); 1751 sve_uzp1(dst, H, dst, tmp); 1752 break; 1753 case B: 1754 assert_different_registers(dst, tmp); 1755 sve_uzp1(dst, S, src, tmp); 1756 sve_uzp1(dst, H, dst, tmp); 1757 sve_uzp1(dst, B, dst, tmp); 1758 break; 1759 default: 1760 ShouldNotReachHere(); 1761 } 1762 } else if (src_size == S) { 1763 if (dst_size == H) { 1764 sve_uzp1(dst, H, src, tmp); 1765 } else { // B 1766 assert_different_registers(dst, tmp); 1767 sve_uzp1(dst, H, src, tmp); 1768 sve_uzp1(dst, B, dst, tmp); 1769 } 1770 } else if (src_size == H) { 1771 sve_uzp1(dst, B, src, tmp); 1772 } 1773 } 1774 1775 // Extend src predicate to dst predicate with the same lane count but larger 1776 // element size, e.g. 
64Byte -> 512Long 1777 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1778 uint dst_element_length_in_bytes, 1779 uint src_element_length_in_bytes) { 1780 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1781 sve_punpklo(dst, src); 1782 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1783 sve_punpklo(dst, src); 1784 sve_punpklo(dst, dst); 1785 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1786 sve_punpklo(dst, src); 1787 sve_punpklo(dst, dst); 1788 sve_punpklo(dst, dst); 1789 } else { 1790 assert(false, "unsupported"); 1791 ShouldNotReachHere(); 1792 } 1793 } 1794 1795 // Narrow src predicate to dst predicate with the same lane count but 1796 // smaller element size, e.g. 512Long -> 64Byte 1797 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1798 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1799 // The insignificant bits in src predicate are expected to be zero. 1800 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1801 // passed as the second argument. An example narrowing operation with a given mask would be - 1802 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1803 // Mask (for 2 Longs) : TF 1804 // Predicate register for the above mask (16 bits) : 00000001 00000000 1805 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1806 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1807 assert_different_registers(src, ptmp); 1808 assert_different_registers(dst, ptmp); 1809 sve_pfalse(ptmp); 1810 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1811 sve_uzp1(dst, B, src, ptmp); 1812 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1813 sve_uzp1(dst, H, src, ptmp); 1814 sve_uzp1(dst, B, dst, ptmp); 1815 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1816 sve_uzp1(dst, S, src, ptmp); 1817 sve_uzp1(dst, H, dst, ptmp); 1818 sve_uzp1(dst, B, dst, ptmp); 1819 } else { 1820 assert(false, "unsupported"); 1821 ShouldNotReachHere(); 1822 } 1823 } 1824 1825 // Vector reduction add for integral type with ASIMD instructions. 1826 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1827 Register isrc, FloatRegister vsrc, 1828 unsigned vector_length_in_bytes, 1829 FloatRegister vtmp) { 1830 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1831 assert_different_registers(dst, isrc); 1832 bool isQ = vector_length_in_bytes == 16; 1833 1834 BLOCK_COMMENT("neon_reduce_add_integral {"); 1835 switch(bt) { 1836 case T_BYTE: 1837 addv(vtmp, isQ ? T16B : T8B, vsrc); 1838 smov(dst, vtmp, B, 0); 1839 addw(dst, dst, isrc, ext::sxtb); 1840 break; 1841 case T_SHORT: 1842 addv(vtmp, isQ ? T8H : T4H, vsrc); 1843 smov(dst, vtmp, H, 0); 1844 addw(dst, dst, isrc, ext::sxth); 1845 break; 1846 case T_INT: 1847 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1848 umov(dst, vtmp, S, 0); 1849 addw(dst, dst, isrc); 1850 break; 1851 case T_LONG: 1852 assert(isQ, "unsupported"); 1853 addpd(vtmp, vsrc); 1854 umov(dst, vtmp, D, 0); 1855 add(dst, dst, isrc); 1856 break; 1857 default: 1858 assert(false, "unsupported"); 1859 ShouldNotReachHere(); 1860 } 1861 BLOCK_COMMENT("} neon_reduce_add_integral"); 1862 } 1863 1864 // Vector reduction multiply for integral type with ASIMD instructions. 
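// There is no NEON multiply-across-lanes instruction, so the vector is repeatedly folded in
// half (multiplying the low half by the high half) until only two lanes remain; those two
// lanes are then multiplied with isrc using scalar multiplies.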
1865 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1866 // Clobbers: rscratch1 1867 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1868 Register isrc, FloatRegister vsrc, 1869 unsigned vector_length_in_bytes, 1870 FloatRegister vtmp1, FloatRegister vtmp2) { 1871 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1872 bool isQ = vector_length_in_bytes == 16; 1873 1874 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1875 switch(bt) { 1876 case T_BYTE: 1877 if (isQ) { 1878 // Multiply the lower half and higher half of vector iteratively. 1879 // vtmp1 = vsrc[8:15] 1880 ins(vtmp1, D, vsrc, 0, 1); 1881 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1882 mulv(vtmp1, T8B, vtmp1, vsrc); 1883 // vtmp2 = vtmp1[4:7] 1884 ins(vtmp2, S, vtmp1, 0, 1); 1885 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1886 mulv(vtmp1, T8B, vtmp2, vtmp1); 1887 } else { 1888 ins(vtmp1, S, vsrc, 0, 1); 1889 mulv(vtmp1, T8B, vtmp1, vsrc); 1890 } 1891 // vtmp2 = vtmp1[2:3] 1892 ins(vtmp2, H, vtmp1, 0, 1); 1893 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1894 mulv(vtmp2, T8B, vtmp2, vtmp1); 1895 // dst = vtmp2[0] * isrc * vtmp2[1] 1896 umov(rscratch1, vtmp2, B, 0); 1897 mulw(dst, rscratch1, isrc); 1898 sxtb(dst, dst); 1899 umov(rscratch1, vtmp2, B, 1); 1900 mulw(dst, rscratch1, dst); 1901 sxtb(dst, dst); 1902 break; 1903 case T_SHORT: 1904 if (isQ) { 1905 ins(vtmp2, D, vsrc, 0, 1); 1906 mulv(vtmp2, T4H, vtmp2, vsrc); 1907 ins(vtmp1, S, vtmp2, 0, 1); 1908 mulv(vtmp1, T4H, vtmp1, vtmp2); 1909 } else { 1910 ins(vtmp1, S, vsrc, 0, 1); 1911 mulv(vtmp1, T4H, vtmp1, vsrc); 1912 } 1913 umov(rscratch1, vtmp1, H, 0); 1914 mulw(dst, rscratch1, isrc); 1915 sxth(dst, dst); 1916 umov(rscratch1, vtmp1, H, 1); 1917 mulw(dst, rscratch1, dst); 1918 sxth(dst, dst); 1919 break; 1920 case T_INT: 1921 if (isQ) { 1922 ins(vtmp1, D, vsrc, 0, 1); 1923 mulv(vtmp1, T2S, vtmp1, vsrc); 1924 } else { 1925 vtmp1 = vsrc; 1926 } 1927 umov(rscratch1, vtmp1, S, 0); 1928 mul(dst, rscratch1, isrc); 1929 umov(rscratch1, vtmp1, S, 1); 1930 mul(dst, rscratch1, dst); 1931 break; 1932 case T_LONG: 1933 umov(rscratch1, vsrc, D, 0); 1934 mul(dst, isrc, rscratch1); 1935 umov(rscratch1, vsrc, D, 1); 1936 mul(dst, dst, rscratch1); 1937 break; 1938 default: 1939 assert(false, "unsupported"); 1940 ShouldNotReachHere(); 1941 } 1942 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1943 } 1944 1945 // Vector reduction multiply for floating-point type with ASIMD instructions. 
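// Floating-point multiplication is not associative, so the lanes are combined strictly in
// order: dst = (((fsrc * vsrc[0]) * vsrc[1]) * vsrc[2]) * vsrc[3] for the 4-lane float case.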
1946 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1947 FloatRegister fsrc, FloatRegister vsrc, 1948 unsigned vector_length_in_bytes, 1949 FloatRegister vtmp) { 1950 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1951 bool isQ = vector_length_in_bytes == 16; 1952 1953 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1954 switch(bt) { 1955 case T_FLOAT: 1956 fmuls(dst, fsrc, vsrc); 1957 ins(vtmp, S, vsrc, 0, 1); 1958 fmuls(dst, dst, vtmp); 1959 if (isQ) { 1960 ins(vtmp, S, vsrc, 0, 2); 1961 fmuls(dst, dst, vtmp); 1962 ins(vtmp, S, vsrc, 0, 3); 1963 fmuls(dst, dst, vtmp); 1964 } 1965 break; 1966 case T_DOUBLE: 1967 assert(isQ, "unsupported"); 1968 fmuld(dst, fsrc, vsrc); 1969 ins(vtmp, D, vsrc, 0, 1); 1970 fmuld(dst, dst, vtmp); 1971 break; 1972 default: 1973 assert(false, "unsupported"); 1974 ShouldNotReachHere(); 1975 } 1976 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1977 } 1978 1979 // Helper to select logical instruction 1980 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1981 Register Rn, Register Rm, 1982 enum shift_kind kind, unsigned shift) { 1983 switch(opc) { 1984 case Op_AndReductionV: 1985 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1986 break; 1987 case Op_OrReductionV: 1988 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1989 break; 1990 case Op_XorReductionV: 1991 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1992 break; 1993 default: 1994 assert(false, "unsupported"); 1995 ShouldNotReachHere(); 1996 } 1997 } 1998 1999 // Vector reduction logical operations And, Or, Xor 2000 // Clobbers: rscratch1 2001 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2002 Register isrc, FloatRegister vsrc, 2003 unsigned vector_length_in_bytes) { 2004 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2005 "unsupported"); 2006 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2007 assert_different_registers(dst, isrc); 2008 bool isQ = vector_length_in_bytes == 16; 2009 2010 BLOCK_COMMENT("neon_reduce_logical {"); 2011 umov(rscratch1, vsrc, isQ ? D : S, 0); 2012 umov(dst, vsrc, isQ ? 
D : S, 1); 2013 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2014 switch(bt) { 2015 case T_BYTE: 2016 if (isQ) { 2017 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2018 } 2019 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2020 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2021 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2022 sxtb(dst, dst); 2023 break; 2024 case T_SHORT: 2025 if (isQ) { 2026 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2027 } 2028 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2029 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2030 sxth(dst, dst); 2031 break; 2032 case T_INT: 2033 if (isQ) { 2034 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2035 } 2036 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2037 break; 2038 case T_LONG: 2039 assert(isQ, "unsupported"); 2040 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2041 break; 2042 default: 2043 assert(false, "unsupported"); 2044 ShouldNotReachHere(); 2045 } 2046 BLOCK_COMMENT("} neon_reduce_logical"); 2047 } 2048 2049 // Vector reduction min/max for integral type with ASIMD instructions. 2050 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2051 // Clobbers: rscratch1, rflags 2052 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2053 Register isrc, FloatRegister vsrc, 2054 unsigned vector_length_in_bytes, 2055 FloatRegister vtmp) { 2056 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2057 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2058 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2059 assert_different_registers(dst, isrc); 2060 bool isQ = vector_length_in_bytes == 16; 2061 bool is_min = opc == Op_MinReductionV; 2062 2063 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2064 if (bt == T_LONG) { 2065 assert(vtmp == fnoreg, "should be"); 2066 assert(isQ, "should be"); 2067 umov(rscratch1, vsrc, D, 0); 2068 cmp(isrc, rscratch1); 2069 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2070 umov(rscratch1, vsrc, D, 1); 2071 cmp(dst, rscratch1); 2072 csel(dst, dst, rscratch1, is_min ? LT : GT); 2073 } else { 2074 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2075 if (size == T2S) { 2076 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2077 } else { 2078 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2079 } 2080 if (bt == T_INT) { 2081 umov(dst, vtmp, S, 0); 2082 } else { 2083 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2084 } 2085 cmpw(dst, isrc); 2086 cselw(dst, dst, isrc, is_min ? LT : GT); 2087 } 2088 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2089 } 2090 2091 // Vector reduction for integral type with SVE instruction. 2092 // Supported operations are Add, And, Or, Xor, Max, Min. 2093 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
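// For example, Op_AddReductionVI with bt == T_INT emits:
//   sve_uaddv(tmp, S, pg, src2); umov(dst, tmp, S, 0); addw(dst, dst, src1);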
2094 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2095 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2096 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2097 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2098 assert_different_registers(src1, dst); 2099 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2100 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2101 switch (opc) { 2102 case Op_AddReductionVI: { 2103 sve_uaddv(tmp, size, pg, src2); 2104 if (bt == T_BYTE) { 2105 smov(dst, tmp, size, 0); 2106 addw(dst, src1, dst, ext::sxtb); 2107 } else if (bt == T_SHORT) { 2108 smov(dst, tmp, size, 0); 2109 addw(dst, src1, dst, ext::sxth); 2110 } else { 2111 umov(dst, tmp, size, 0); 2112 addw(dst, dst, src1); 2113 } 2114 break; 2115 } 2116 case Op_AddReductionVL: { 2117 sve_uaddv(tmp, size, pg, src2); 2118 umov(dst, tmp, size, 0); 2119 add(dst, dst, src1); 2120 break; 2121 } 2122 case Op_AndReductionV: { 2123 sve_andv(tmp, size, pg, src2); 2124 if (bt == T_INT || bt == T_LONG) { 2125 umov(dst, tmp, size, 0); 2126 } else { 2127 smov(dst, tmp, size, 0); 2128 } 2129 if (bt == T_LONG) { 2130 andr(dst, dst, src1); 2131 } else { 2132 andw(dst, dst, src1); 2133 } 2134 break; 2135 } 2136 case Op_OrReductionV: { 2137 sve_orv(tmp, size, pg, src2); 2138 if (bt == T_INT || bt == T_LONG) { 2139 umov(dst, tmp, size, 0); 2140 } else { 2141 smov(dst, tmp, size, 0); 2142 } 2143 if (bt == T_LONG) { 2144 orr(dst, dst, src1); 2145 } else { 2146 orrw(dst, dst, src1); 2147 } 2148 break; 2149 } 2150 case Op_XorReductionV: { 2151 sve_eorv(tmp, size, pg, src2); 2152 if (bt == T_INT || bt == T_LONG) { 2153 umov(dst, tmp, size, 0); 2154 } else { 2155 smov(dst, tmp, size, 0); 2156 } 2157 if (bt == T_LONG) { 2158 eor(dst, dst, src1); 2159 } else { 2160 eorw(dst, dst, src1); 2161 } 2162 break; 2163 } 2164 case Op_MaxReductionV: { 2165 sve_smaxv(tmp, size, pg, src2); 2166 if (bt == T_INT || bt == T_LONG) { 2167 umov(dst, tmp, size, 0); 2168 } else { 2169 smov(dst, tmp, size, 0); 2170 } 2171 if (bt == T_LONG) { 2172 cmp(dst, src1); 2173 csel(dst, dst, src1, Assembler::GT); 2174 } else { 2175 cmpw(dst, src1); 2176 cselw(dst, dst, src1, Assembler::GT); 2177 } 2178 break; 2179 } 2180 case Op_MinReductionV: { 2181 sve_sminv(tmp, size, pg, src2); 2182 if (bt == T_INT || bt == T_LONG) { 2183 umov(dst, tmp, size, 0); 2184 } else { 2185 smov(dst, tmp, size, 0); 2186 } 2187 if (bt == T_LONG) { 2188 cmp(dst, src1); 2189 csel(dst, dst, src1, Assembler::LT); 2190 } else { 2191 cmpw(dst, src1); 2192 cselw(dst, dst, src1, Assembler::LT); 2193 } 2194 break; 2195 } 2196 default: 2197 assert(false, "unsupported"); 2198 ShouldNotReachHere(); 2199 } 2200 2201 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2202 if (bt == T_BYTE) { 2203 sxtb(dst, dst); 2204 } else if (bt == T_SHORT) { 2205 sxth(dst, dst); 2206 } 2207 } 2208 } 2209 2210 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2211 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2212 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
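// For example, with bt == T_INT on a 512-bit SVE machine (max 16 lanes), lane_cnt == 10
// matches neither a fixed VL pattern nor the POW2/MUL4/MUL3 patterns, so it falls back to:
//   mov(rscratch1, 10); sve_whileltw(dst, S, zr, rscratch1);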
2213 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2214 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2215 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2216 2217 // Set all elements to false if the input "lane_cnt" is zero. 2218 if (lane_cnt == 0) { 2219 sve_pfalse(dst); 2220 return; 2221 } 2222 2223 SIMD_RegVariant size = elemType_to_regVariant(bt); 2224 assert(size != Q, "invalid size"); 2225 2226 // Set all elements to true if "lane_cnt" equals the max lane count. 2227 if (lane_cnt == max_vector_length) { 2228 sve_ptrue(dst, size, /* ALL */ 0b11111); 2229 return; 2230 } 2231 2232 // Fixed numbers for "ptrue". 2233 switch(lane_cnt) { 2234 case 1: /* VL1 */ 2235 case 2: /* VL2 */ 2236 case 3: /* VL3 */ 2237 case 4: /* VL4 */ 2238 case 5: /* VL5 */ 2239 case 6: /* VL6 */ 2240 case 7: /* VL7 */ 2241 case 8: /* VL8 */ 2242 sve_ptrue(dst, size, lane_cnt); 2243 return; 2244 case 16: 2245 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2246 return; 2247 case 32: 2248 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2249 return; 2250 case 64: 2251 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2252 return; 2253 case 128: 2254 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2255 return; 2256 case 256: 2257 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2258 return; 2259 default: 2260 break; 2261 } 2262 2263 // Special patterns for "ptrue". 2264 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2265 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2266 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2267 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2268 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2269 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2270 } else { 2271 // Encode to "whileltw" for the remaining cases. 2272 mov(rscratch1, lane_cnt); 2273 sve_whileltw(dst, size, zr, rscratch1); 2274 } 2275 } 2276 2277 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2278 // Any remaining elements of dst will be filled with zero. 2279 // Clobbers: rscratch1 2280 // Preserves: src, mask 2281 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2282 FloatRegister vtmp1, FloatRegister vtmp2, 2283 PRegister pgtmp) { 2284 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2285 assert_different_registers(dst, src, vtmp1, vtmp2); 2286 assert_different_registers(mask, pgtmp); 2287 2288 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 2289 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 2290 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 2291 sve_dup(vtmp2, H, 0); 2292 2293 // Extend the lowest half to type INT. 2294 // dst = 00004444 00003333 00002222 00001111 2295 sve_uunpklo(dst, S, src); 2296 // pgtmp = 00000001 00000000 00000001 00000001 2297 sve_punpklo(pgtmp, mask); 2298 // Pack the active INT-sized elements to the right, 2299 // and fill the remaining elements with zero. 2300 // dst = 00000000 00004444 00002222 00001111 2301 sve_compact(dst, S, dst, pgtmp); 2302 // Narrow the result back to type SHORT. 2303 // dst = 0000 0000 0000 0000 0000 4444 2222 1111 2304 sve_uzp1(dst, H, dst, vtmp2); 2305 // Count the active elements of the lowest half. 2306 // rscratch1 = 3 2307 sve_cntp(rscratch1, S, ptrue, pgtmp); 2308 2309 // Repeat for the highest half.
2310 // pgtmp = 00000001 00000000 00000000 00000001 2311 sve_punpkhi(pgtmp, mask); 2312 // vtmp1 = 00008888 00007777 00006666 00005555 2313 sve_uunpkhi(vtmp1, S, src); 2314 // vtmp1 = 00000000 00000000 00008888 00005555 2315 sve_compact(vtmp1, S, vtmp1, pgtmp); 2316 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2317 sve_uzp1(vtmp1, H, vtmp1, vtmp2); 2318 2319 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111 2320 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2321 // Left shift (cross-lane) the compressed high by TRUE_CNT lanes, where 2322 // TRUE_CNT is the number of active elements in the compressed low. 2323 neg(rscratch1, rscratch1); 2324 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2325 sve_index(vtmp2, H, rscratch1, 1); 2326 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000 2327 sve_tbl(vtmp1, H, vtmp1, vtmp2); 2328 2329 // Combine the compressed high (after shifting) with the compressed low. 2330 // dst = 0000 0000 0000 8888 5555 4444 2222 1111 2331 sve_orr(dst, dst, vtmp1); 2332 } 2333 2334 // Clobbers: rscratch1, rscratch2 2335 // Preserves: src, mask 2336 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, 2337 FloatRegister vtmp1, FloatRegister vtmp2, 2338 FloatRegister vtmp3, FloatRegister vtmp4, 2339 PRegister ptmp, PRegister pgtmp) { 2340 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2341 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4); 2342 assert_different_registers(mask, ptmp, pgtmp); 2343 // Example input: src = 88 77 66 55 44 33 22 11 2344 // mask = 01 00 00 01 01 00 01 01 2345 // Expected result: dst = 00 00 00 88 55 44 22 11 2346 2347 sve_dup(vtmp4, B, 0); 2348 // Extend the lowest half to type SHORT. 2349 // vtmp1 = 0044 0033 0022 0011 2350 sve_uunpklo(vtmp1, H, src); 2351 // ptmp = 0001 0000 0001 0001 2352 sve_punpklo(ptmp, mask); 2353 // Count the active elements of the lowest half. 2354 // rscratch2 = 3 2355 sve_cntp(rscratch2, H, ptrue, ptmp); 2356 // Pack the active SHORT-sized elements to the right, 2357 // and fill the remaining elements with zero. 2358 // dst = 0000 0044 0022 0011 2359 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp); 2360 // Narrow the result back to type BYTE. 2361 // dst = 00 00 00 00 00 44 22 11 2362 sve_uzp1(dst, B, dst, vtmp4); 2363 2364 // Repeat for the highest half. 2365 // ptmp = 0001 0000 0000 0001 2366 sve_punpkhi(ptmp, mask); 2367 // vtmp2 = 0088 0077 0066 0055 2368 sve_uunpkhi(vtmp2, H, src); 2369 // vtmp1 = 0000 0000 0088 0055 2370 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp); 2371 2372 sve_dup(vtmp4, B, 0); 2373 // vtmp1 = 00 00 00 00 00 00 88 55 2374 sve_uzp1(vtmp1, B, vtmp1, vtmp4); 2375 2376 // Compressed low: dst = 00 00 00 00 00 44 22 11 2377 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55 2378 // Left shift (cross-lane) the compressed high by TRUE_CNT lanes, where 2379 // TRUE_CNT is the number of active elements in the compressed low. 2380 neg(rscratch2, rscratch2); 2381 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2382 sve_index(vtmp2, B, rscratch2, 1); 2383 // vtmp1 = 00 00 00 88 55 00 00 00 2384 sve_tbl(vtmp1, B, vtmp1, vtmp2); 2385 // Combine the compressed high (after shifting) with the compressed low.
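// The shifted high part is zero in the lanes where the compressed low part holds data, and
// vice versa, so a plain bitwise OR merges the two halves.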
2386 // dst = 00 00 00 88 55 44 22 11 2387 sve_orr(dst, dst, vtmp1); 2388 } 2389 2390 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2391 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2392 SIMD_Arrangement size = isQ ? T16B : T8B; 2393 if (bt == T_BYTE) { 2394 rbit(dst, size, src); 2395 } else { 2396 neon_reverse_bytes(dst, src, bt, isQ); 2397 rbit(dst, size, dst); 2398 } 2399 } 2400 2401 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2402 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2403 SIMD_Arrangement size = isQ ? T16B : T8B; 2404 switch (bt) { 2405 case T_BYTE: 2406 if (dst != src) { 2407 orr(dst, size, src, src); 2408 } 2409 break; 2410 case T_SHORT: 2411 rev16(dst, size, src); 2412 break; 2413 case T_INT: 2414 rev32(dst, size, src); 2415 break; 2416 case T_LONG: 2417 rev64(dst, size, src); 2418 break; 2419 default: 2420 assert(false, "unsupported"); 2421 ShouldNotReachHere(); 2422 } 2423 } 2424 2425 // Extract a scalar element from an sve vector at position 'idx'. 2426 // The input elements in src are expected to be of integral type. 2427 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2428 int idx, FloatRegister vtmp) { 2429 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2430 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2431 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2432 if (bt == T_INT || bt == T_LONG) { 2433 umov(dst, src, size, idx); 2434 } else { 2435 smov(dst, src, size, idx); 2436 } 2437 } else { 2438 sve_orr(vtmp, src, src); 2439 sve_ext(vtmp, vtmp, idx << size); 2440 if (bt == T_INT || bt == T_LONG) { 2441 umov(dst, vtmp, size, 0); 2442 } else { 2443 smov(dst, vtmp, size, 0); 2444 } 2445 } 2446 } 2447 2448 // java.lang.Math::round intrinsics 2449 2450 // Clobbers: rscratch1, rflags 2451 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2452 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2453 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2454 switch (T) { 2455 case T2S: 2456 case T4S: 2457 fmovs(tmp1, T, 0.5f); 2458 mov(rscratch1, jint_cast(0x1.0p23f)); 2459 break; 2460 case T2D: 2461 fmovd(tmp1, T, 0.5); 2462 mov(rscratch1, julong_cast(0x1.0p52)); 2463 break; 2464 default: 2465 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2466 } 2467 fadd(tmp1, T, tmp1, src); 2468 fcvtms(tmp1, T, tmp1); 2469 // tmp1 = floor(src + 0.5, ties to even) 2470 2471 fcvtas(dst, T, src); 2472 // dst = round(src), ties to away 2473 2474 fneg(tmp3, T, src); 2475 dup(tmp2, T, rscratch1); 2476 cm(HS, tmp3, T, tmp3, tmp2); 2477 // tmp3 is now a set of flags 2478 2479 bif(dst, T16B, tmp1, tmp3); 2480 // result in dst 2481 } 2482 2483 // Clobbers: rscratch1, rflags 2484 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2485 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2486 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2487 assert_different_registers(tmp1, tmp2, src, dst); 2488 2489 switch (T) { 2490 case S: 2491 mov(rscratch1, jint_cast(0x1.0p23f)); 2492 break; 2493 case D: 2494 mov(rscratch1, julong_cast(0x1.0p52)); 2495 break; 2496 
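// Note: 0x1.0p23f (2^23) and 0x1.0p52 (2^52) above are the smallest magnitudes at which
// every float/double value is already an integer, so lanes at or beyond that magnitude
// keep the round-to-away result and need no extra adjustment.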
default: 2497 assert(T == S || T == D, "invalid register variant"); 2498 } 2499 2500 sve_frinta(dst, T, ptrue, src); 2501 // dst = round(src), ties to away 2502 2503 Label none; 2504 2505 sve_fneg(tmp1, T, ptrue, src); 2506 sve_dup(tmp2, T, rscratch1); 2507 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2508 br(EQ, none); 2509 { 2510 sve_cpy(tmp1, T, pgtmp, 0.5); 2511 sve_fadd(tmp1, T, pgtmp, src); 2512 sve_frintm(dst, T, pgtmp, tmp1); 2513 // dst = floor(src + 0.5, ties to even) 2514 } 2515 bind(none); 2516 2517 sve_fcvtzs(dst, T, ptrue, dst, T); 2518 // result in dst 2519 } 2520 2521 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2522 FloatRegister one, SIMD_Arrangement T) { 2523 assert_different_registers(dst, src, zero, one); 2524 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2525 2526 facgt(dst, T, src, zero); 2527 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise 2528 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst 2529 } 2530 2531 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero, 2532 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) { 2533 assert_different_registers(dst, src, zero, one, vtmp); 2534 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2535 2536 sve_orr(vtmp, src, src); 2537 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise 2538 switch (T) { 2539 case S: 2540 sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src 2541 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending 2542 // on the sign of the float value 2543 break; 2544 case D: 2545 sve_and(vtmp, T, min_jlong); 2546 sve_orr(vtmp, T, jlong_cast(1.0)); 2547 break; 2548 default: 2549 assert(false, "unsupported"); 2550 ShouldNotReachHere(); 2551 } 2552 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp 2553 // Result in dst 2554 } 2555 2556 bool C2_MacroAssembler::in_scratch_emit_size() { 2557 if (ciEnv::current()->task() != nullptr) { 2558 PhaseOutput* phase_output = Compile::current()->output(); 2559 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2560 return true; 2561 } 2562 } 2563 return MacroAssembler::in_scratch_emit_size(); 2564 } 2565 2566 void C2_MacroAssembler::load_nklass_compact(Register dst, Register obj, Register index, int scale, int disp) { 2567 // Note: Don't clobber obj anywhere in this method! 2568 2569 // The incoming address points to obj-start + klass_offset_in_bytes. We need to extract 2570 // obj-start, so that we can load from the object's mark-word instead. Usually the address 2571 // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2 2572 // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and 2573 // then passes that register as obj and 0 in disp. The following code extracts the base 2574 // and offset to load the mark-word. 2575 int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes(); 2576 if (index == noreg) { 2577 ldr(dst, Address(obj, offset)); 2578 } else { 2579 lea(dst, Address(obj, index, Address::lsl(scale))); 2580 ldr(dst, Address(dst, offset)); 2581 } 2582 lsr(dst, dst, markWord::klass_shift); 2583 }