/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  // Dummy labels for just measuring the code size
  Label dummy_slow_path;
  Label dummy_continuation;
  Label dummy_guard;
  Label* slow_path = &dummy_slow_path;
  Label* continuation = &dummy_continuation;
  Label* guard = &dummy_guard;
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Use real labels from actual stub when not emitting code for the purpose of measuring its size
    C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
    Compile::current()->output()->add_stub(stub);
    slow_path = &stub->entry();
    continuation = &stub->continuation();
    guard = &stub->guard();
  }
  // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
  bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
}

// jdk.internal.util.ArraysSupport.vectorizedHashCode
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT ? 4
                    : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;
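
  // For reference, the scalar loop below computes the usual polynomial hash,
  // sketched here as plain C (illustrative only; 'T' stands for the element
  // type selected by 'eltype', and 'h' is the incoming value in 'result'):
  //
  //   int hash(const T *a, int n, int h) {
  //     for (int i = 0; i < n; i++) {
  //       h = 31 * h + a[i];   // one load + one maddw per element below
  //     }
  //     return h;
  //   }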

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  movw(tmp2, 0x1f);
  br(tmp1);

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1);
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

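  // Reminder of the mark word lock bits used below (see markWord.hpp):
  //   0b01 - unlocked, 0b00 - fast-locked (owner recorded on the lock stack),
  //   0b10 - inflated monitor.
  // The fast path tries to move 0b01 -> 0b00 with a CAS and pushes obj onto the
  // per-thread lock stack; anything else falls through to the monitor or slow path.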
  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

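    // Inflated-monitor fast path: the monitor is unowned when its owner field is
    // zero. We CAS it from zero to this thread's _monitor_owner_id; if the CAS
    // fails but the observed owner already equals our id, the lock is held
    // recursively and only the recursion count needs to be bumped.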
    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (meaning we pushed in locked)
    // there will be no monitor in the box. So we need to push back the obj
    // so that the runtime can fix any potential anonymous owner.
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
    cmp(rscratch1, zr);
    br(Assembler::EQ, unlocked); // If so we are done.

    // Check if there is a successor.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
    cmp(rscratch1, zr);
    br(Assembler::NE, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));

    cmp(zr, rthread); // Set Flag to NE => slow path
    b(slow_path);
  }

  bind(unlocked);
  cmp(zr, zr); // Set Flags to EQ => fast path

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

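  // Strategy selection for the unknown-length case (icnt1 == -1), derived from
  // the checks below: patterns shorter than 8 characters use the inline linear
  // scan; patterns of 256 characters or more, or at least a quarter of the
  // source length, go to the linear-scan stub; everything else uses the inline
  // Boyer-Moore-Horspool code.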
  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = y[i+j];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c< 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

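  // Worked example of the bad-character table above (illustrative): for the
  // Latin1 pattern "needle" (m = 6) the preprocessing loop leaves bc[] = 6 for
  // characters not in the pattern and
  //   bc['n'] = 5, bc['e'] = 3, bc['d'] = 2, bc['l'] = 1
  // (the last pattern character is not entered), so after a mismatch the
  // pattern is shifted by bc[c] positions, where c is the source character
  // aligned with the end of the pattern.
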
  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
    stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
    subs(tmp5, tmp5, 1);
    br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
    (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
    if (!str1_isL) {
      subs(zr, ch1, ASIZE);
      br(HS, BCSKIP);
    }
    strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
    subs(ch2, ch2, 1);
    br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8); // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
    (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
    if (str1_isL == str2_isL) {
      // re-init tmp3. It's for free because it's executed in parallel with
      // load above. Alternative is to initialize it before loop, but it'll
      // affect performance on in-order systems with 2 or more ld/st pipelines
      lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
    }
    if (!isL) { // UU/UL case
      lsl(ch2, cnt1tmp, 1); // offset in bytes
    }
    cmp(tmp3, skipch);
    br(NE, BMSKIP);
    ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
    mov(ch1, tmp6);
    if (isL) {
      b(BMLOOPSTR1_AFTER_LOAD);
    } else {
      sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
      b(BMLOOPSTR1_CMP);
    }
    BIND(BMLOOPSTR1);
    (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
    (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
    subs(cnt1tmp, cnt1tmp, 1);
    br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
    cmp(ch1, ch2);
    br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
    if (!isL) {
      // if we've met UTF symbol while searching Latin1 pattern, then we can
      // skip cnt1 symbols
      if (str1_isL != str2_isL) {
        mov(result_tmp, cnt1);
      } else {
        mov(result_tmp, 1);
      }
      subs(zr, skipch, ASIZE);
      br(HS, BMADV);
    }
    ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
    sub(cnt1tmp, cnt1, 1);
    add(str2, str2, result_tmp, LSL, str2_chr_shift);
    cmp(str2, str2end);
    br(LE, BMLOOPSTR2);
    add(sp, sp, ASIZE);
    b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
    cmp(ch1, ch2);
    br(NE, BMSKIP);
    BIND(BMMATCH);
    sub(result, str2, tmp5);
    if (!str2_isL) lsr(result, result, 1);
    add(sp, sp, ASIZE);
    b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

      BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
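      // The loop below searches 8 bytes (or 4 chars) at a time: the candidate
      // character has been broadcast into every lane of ch1 above, so after
      // 'eor' a matching lane becomes zero, and the classic SWAR test
      //   (x - 0x01...01) & ~x & 0x80...80
      // (expressed here with sub/orr/bics on the lane-appropriate constants)
      // produces a non-zero bit only in lanes that were zero, i.e. at a match.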
      BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

      BIND(HAS_ZERO);
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

      BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
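    // The two instructions after brka turn the first active lane of tmp_pdn into
    // the in-chunk match position: brka activates every lane up to and including
    // the first match, and incp adds the number of active lanes to 'result'. For
    // example, a match in lane 3 activates 4 lanes, so result = idx - 1 + 4,
    // i.e. the element index of the match.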
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
  ldrb(ch1, Address(str1, cnt1_neg));
  cmp(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 1);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
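  // If one string is a prefix of the other, the signed length difference left in
  // 'result' here is returned unchanged at DONE, matching the String.compareTo
  // convention (negative, zero or positive).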

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
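    // At DIFF, rscratch2 = tmp1 ^ tmp2 is non-zero. Reversing its byte order and
    // counting leading zeros gives (after rounding down to a character boundary)
    // the bit offset of the first differing character in memory order; shifting
    // both words right by that amount leaves those characters in the low bits,
    // where they are zero-extended and subtracted.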
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
  case StrIntrinsicNode::LL:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
    break;
  case StrIntrinsicNode::UU:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
    break;
  case StrIntrinsicNode::LU:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
    break;
  case StrIntrinsicNode::UL:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
    break;
  default:
    ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
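  // ASIMD only provides the GT/GE/HI/HS/EQ register-register compares, so the
  // remaining conditions are mapped onto them: LT/LE/LO/LS swap the operands,
  // and NE is computed as EQ followed by a bitwise NOT of the result.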
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected: dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    // ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates mask value with the minimum unit in byte, we should
  // transform the value in the first lane which is mask in bit now to the
  // mask in byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing. As only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  // ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate mask according to the given vector, in which the elements have been
  // extended to expected type.
  // dst = 0b01101001 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
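  // The approach: reverse the predicate so the last set lane becomes the first,
  // use BRKB to keep only the lanes before that first set lane, and count them.
  // For example, with 8 lanes and bits set at indices 2 and 4, the reversed
  // predicate has its first set lane at index 3, BRKB leaves 3 active lanes,
  // and dst = (8 - 1) - 3 = 4, the index of the last set lane.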
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
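  // _xshll emits sshll/ushll with a zero shift, i.e. the sxtl/uxtl aliases, so
  // each step below widens the element size once (8B -> 8H, 4H -> 4S, 2S -> 2D).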
  if (src_bt == T_BYTE) {
    // 4B to 4S/4I, 8B to 8S
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    if (dst_bt == T_INT) {
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 2S to 2I/2L, 4S to 4I
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
    if (dst_bt == T_LONG) {
      _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
    }
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 2I to 2S, 4I to 4B/4S
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2S/2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T2S, src, T2D);
    if (dst_bt == T_SHORT) {
      xtn(dst, T4H, dst, T4S);
    }
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          bool is_unsigned) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

  if (src_size == B) {
    switch (dst_size) {
    case H:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      break;
    case S:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      break;
    case D:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
    } else { // D
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
    }
  } else if (src_size == S) {
    _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
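  // Each sve_uzp1 below concatenates the even-numbered elements of its two
  // sources; using the zeroed tmp as the second source keeps the low halves of
  // src in the lower part of dst and fills the upper part with zero.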
1684 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1685 FloatRegister src, SIMD_RegVariant src_size, 1686 FloatRegister tmp) { 1687 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1688 assert_different_registers(src, tmp); 1689 sve_dup(tmp, src_size, 0); 1690 if (src_size == D) { 1691 switch (dst_size) { 1692 case S: 1693 sve_uzp1(dst, S, src, tmp); 1694 break; 1695 case H: 1696 assert_different_registers(dst, tmp); 1697 sve_uzp1(dst, S, src, tmp); 1698 sve_uzp1(dst, H, dst, tmp); 1699 break; 1700 case B: 1701 assert_different_registers(dst, tmp); 1702 sve_uzp1(dst, S, src, tmp); 1703 sve_uzp1(dst, H, dst, tmp); 1704 sve_uzp1(dst, B, dst, tmp); 1705 break; 1706 default: 1707 ShouldNotReachHere(); 1708 } 1709 } else if (src_size == S) { 1710 if (dst_size == H) { 1711 sve_uzp1(dst, H, src, tmp); 1712 } else { // B 1713 assert_different_registers(dst, tmp); 1714 sve_uzp1(dst, H, src, tmp); 1715 sve_uzp1(dst, B, dst, tmp); 1716 } 1717 } else if (src_size == H) { 1718 sve_uzp1(dst, B, src, tmp); 1719 } 1720 } 1721 1722 // Extend src predicate to dst predicate with the same lane count but larger 1723 // element size, e.g. 64Byte -> 512Long 1724 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1725 uint dst_element_length_in_bytes, 1726 uint src_element_length_in_bytes) { 1727 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1728 sve_punpklo(dst, src); 1729 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1730 sve_punpklo(dst, src); 1731 sve_punpklo(dst, dst); 1732 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1733 sve_punpklo(dst, src); 1734 sve_punpklo(dst, dst); 1735 sve_punpklo(dst, dst); 1736 } else { 1737 assert(false, "unsupported"); 1738 ShouldNotReachHere(); 1739 } 1740 } 1741 1742 // Narrow src predicate to dst predicate with the same lane count but 1743 // smaller element size, e.g. 512Long -> 64Byte 1744 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1745 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1746 // The insignificant bits in src predicate are expected to be zero. 1747 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1748 // passed as the second argument. An example narrowing operation with a given mask would be - 1749 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1750 // Mask (for 2 Longs) : TF 1751 // Predicate register for the above mask (16 bits) : 00000001 00000000 1752 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1753 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1754 assert_different_registers(src, ptmp); 1755 assert_different_registers(dst, ptmp); 1756 sve_pfalse(ptmp); 1757 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1758 sve_uzp1(dst, B, src, ptmp); 1759 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1760 sve_uzp1(dst, H, src, ptmp); 1761 sve_uzp1(dst, B, dst, ptmp); 1762 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1763 sve_uzp1(dst, S, src, ptmp); 1764 sve_uzp1(dst, H, dst, ptmp); 1765 sve_uzp1(dst, B, dst, ptmp); 1766 } else { 1767 assert(false, "unsupported"); 1768 ShouldNotReachHere(); 1769 } 1770 } 1771 1772 // Vector reduction add for integral type with ASIMD instructions. 
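// Editorial note (not part of the original sources): the value produced is the scalar sum
//   dst = isrc + vsrc[0] + vsrc[1] + ... + vsrc[lanes - 1]
// in the element type's arithmetic. For T_INT on a 128-bit vector, for instance, addv sums
// the four S lanes of vsrc into lane 0 of vtmp and the trailing addw folds in isrc.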
1773 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1774 Register isrc, FloatRegister vsrc, 1775 unsigned vector_length_in_bytes, 1776 FloatRegister vtmp) { 1777 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1778 assert_different_registers(dst, isrc); 1779 bool isQ = vector_length_in_bytes == 16; 1780 1781 BLOCK_COMMENT("neon_reduce_add_integral {"); 1782 switch(bt) { 1783 case T_BYTE: 1784 addv(vtmp, isQ ? T16B : T8B, vsrc); 1785 smov(dst, vtmp, B, 0); 1786 addw(dst, dst, isrc, ext::sxtb); 1787 break; 1788 case T_SHORT: 1789 addv(vtmp, isQ ? T8H : T4H, vsrc); 1790 smov(dst, vtmp, H, 0); 1791 addw(dst, dst, isrc, ext::sxth); 1792 break; 1793 case T_INT: 1794 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1795 umov(dst, vtmp, S, 0); 1796 addw(dst, dst, isrc); 1797 break; 1798 case T_LONG: 1799 assert(isQ, "unsupported"); 1800 addpd(vtmp, vsrc); 1801 umov(dst, vtmp, D, 0); 1802 add(dst, dst, isrc); 1803 break; 1804 default: 1805 assert(false, "unsupported"); 1806 ShouldNotReachHere(); 1807 } 1808 BLOCK_COMMENT("} neon_reduce_add_integral"); 1809 } 1810 1811 // Vector reduction multiply for integral type with ASIMD instructions. 1812 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1813 // Clobbers: rscratch1 1814 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1815 Register isrc, FloatRegister vsrc, 1816 unsigned vector_length_in_bytes, 1817 FloatRegister vtmp1, FloatRegister vtmp2) { 1818 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1819 bool isQ = vector_length_in_bytes == 16; 1820 1821 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1822 switch(bt) { 1823 case T_BYTE: 1824 if (isQ) { 1825 // Multiply the lower half and higher half of vector iteratively. 
1826 // vtmp1 = vsrc[8:15] 1827 ins(vtmp1, D, vsrc, 0, 1); 1828 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1829 mulv(vtmp1, T8B, vtmp1, vsrc); 1830 // vtmp2 = vtmp1[4:7] 1831 ins(vtmp2, S, vtmp1, 0, 1); 1832 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1833 mulv(vtmp1, T8B, vtmp2, vtmp1); 1834 } else { 1835 ins(vtmp1, S, vsrc, 0, 1); 1836 mulv(vtmp1, T8B, vtmp1, vsrc); 1837 } 1838 // vtmp2 = vtmp1[2:3] 1839 ins(vtmp2, H, vtmp1, 0, 1); 1840 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1841 mulv(vtmp2, T8B, vtmp2, vtmp1); 1842 // dst = vtmp2[0] * isrc * vtmp2[1] 1843 umov(rscratch1, vtmp2, B, 0); 1844 mulw(dst, rscratch1, isrc); 1845 sxtb(dst, dst); 1846 umov(rscratch1, vtmp2, B, 1); 1847 mulw(dst, rscratch1, dst); 1848 sxtb(dst, dst); 1849 break; 1850 case T_SHORT: 1851 if (isQ) { 1852 ins(vtmp2, D, vsrc, 0, 1); 1853 mulv(vtmp2, T4H, vtmp2, vsrc); 1854 ins(vtmp1, S, vtmp2, 0, 1); 1855 mulv(vtmp1, T4H, vtmp1, vtmp2); 1856 } else { 1857 ins(vtmp1, S, vsrc, 0, 1); 1858 mulv(vtmp1, T4H, vtmp1, vsrc); 1859 } 1860 umov(rscratch1, vtmp1, H, 0); 1861 mulw(dst, rscratch1, isrc); 1862 sxth(dst, dst); 1863 umov(rscratch1, vtmp1, H, 1); 1864 mulw(dst, rscratch1, dst); 1865 sxth(dst, dst); 1866 break; 1867 case T_INT: 1868 if (isQ) { 1869 ins(vtmp1, D, vsrc, 0, 1); 1870 mulv(vtmp1, T2S, vtmp1, vsrc); 1871 } else { 1872 vtmp1 = vsrc; 1873 } 1874 umov(rscratch1, vtmp1, S, 0); 1875 mul(dst, rscratch1, isrc); 1876 umov(rscratch1, vtmp1, S, 1); 1877 mul(dst, rscratch1, dst); 1878 break; 1879 case T_LONG: 1880 umov(rscratch1, vsrc, D, 0); 1881 mul(dst, isrc, rscratch1); 1882 umov(rscratch1, vsrc, D, 1); 1883 mul(dst, dst, rscratch1); 1884 break; 1885 default: 1886 assert(false, "unsupported"); 1887 ShouldNotReachHere(); 1888 } 1889 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1890 } 1891 1892 // Vector reduction multiply for floating-point type with ASIMD instructions. 1893 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1894 FloatRegister fsrc, FloatRegister vsrc, 1895 unsigned vector_length_in_bytes, 1896 FloatRegister vtmp) { 1897 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1898 bool isQ = vector_length_in_bytes == 16; 1899 1900 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1901 switch(bt) { 1902 case T_FLOAT: 1903 fmuls(dst, fsrc, vsrc); 1904 ins(vtmp, S, vsrc, 0, 1); 1905 fmuls(dst, dst, vtmp); 1906 if (isQ) { 1907 ins(vtmp, S, vsrc, 0, 2); 1908 fmuls(dst, dst, vtmp); 1909 ins(vtmp, S, vsrc, 0, 3); 1910 fmuls(dst, dst, vtmp); 1911 } 1912 break; 1913 case T_DOUBLE: 1914 assert(isQ, "unsupported"); 1915 fmuld(dst, fsrc, vsrc); 1916 ins(vtmp, D, vsrc, 0, 1); 1917 fmuld(dst, dst, vtmp); 1918 break; 1919 default: 1920 assert(false, "unsupported"); 1921 ShouldNotReachHere(); 1922 } 1923 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1924 } 1925 1926 // Helper to select logical instruction 1927 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1928 Register Rn, Register Rm, 1929 enum shift_kind kind, unsigned shift) { 1930 switch(opc) { 1931 case Op_AndReductionV: 1932 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1933 break; 1934 case Op_OrReductionV: 1935 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1936 break; 1937 case Op_XorReductionV: 1938 is64 ? 
eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1939 break; 1940 default: 1941 assert(false, "unsupported"); 1942 ShouldNotReachHere(); 1943 } 1944 } 1945 1946 // Vector reduction logical operations And, Or, Xor 1947 // Clobbers: rscratch1 1948 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1949 Register isrc, FloatRegister vsrc, 1950 unsigned vector_length_in_bytes) { 1951 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1952 "unsupported"); 1953 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1954 assert_different_registers(dst, isrc); 1955 bool isQ = vector_length_in_bytes == 16; 1956 1957 BLOCK_COMMENT("neon_reduce_logical {"); 1958 umov(rscratch1, vsrc, isQ ? D : S, 0); 1959 umov(dst, vsrc, isQ ? D : S, 1); 1960 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 1961 switch(bt) { 1962 case T_BYTE: 1963 if (isQ) { 1964 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1965 } 1966 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1967 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 1968 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1969 sxtb(dst, dst); 1970 break; 1971 case T_SHORT: 1972 if (isQ) { 1973 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1974 } 1975 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1976 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1977 sxth(dst, dst); 1978 break; 1979 case T_INT: 1980 if (isQ) { 1981 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1982 } 1983 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1984 break; 1985 case T_LONG: 1986 assert(isQ, "unsupported"); 1987 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 1988 break; 1989 default: 1990 assert(false, "unsupported"); 1991 ShouldNotReachHere(); 1992 } 1993 BLOCK_COMMENT("} neon_reduce_logical"); 1994 } 1995 1996 // Vector reduction min/max for integral type with ASIMD instructions. 1997 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 1998 // Clobbers: rscratch1, rflags 1999 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2000 Register isrc, FloatRegister vsrc, 2001 unsigned vector_length_in_bytes, 2002 FloatRegister vtmp) { 2003 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2004 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2005 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2006 assert_different_registers(dst, isrc); 2007 bool isQ = vector_length_in_bytes == 16; 2008 bool is_min = opc == Op_MinReductionV; 2009 2010 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2011 if (bt == T_LONG) { 2012 assert(vtmp == fnoreg, "should be"); 2013 assert(isQ, "should be"); 2014 umov(rscratch1, vsrc, D, 0); 2015 cmp(isrc, rscratch1); 2016 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2017 umov(rscratch1, vsrc, D, 1); 2018 cmp(dst, rscratch1); 2019 csel(dst, dst, rscratch1, is_min ? LT : GT); 2020 } else { 2021 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2022 if (size == T2S) { 2023 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2024 } else { 2025 is_min ? 
sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2026 } 2027 if (bt == T_INT) { 2028 umov(dst, vtmp, S, 0); 2029 } else { 2030 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2031 } 2032 cmpw(dst, isrc); 2033 cselw(dst, dst, isrc, is_min ? LT : GT); 2034 } 2035 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2036 } 2037 2038 // Vector reduction for integral type with SVE instruction. 2039 // Supported operations are Add, And, Or, Xor, Max, Min. 2040 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 2041 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2042 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2043 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2044 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2045 assert_different_registers(src1, dst); 2046 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2047 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2048 switch (opc) { 2049 case Op_AddReductionVI: { 2050 sve_uaddv(tmp, size, pg, src2); 2051 if (bt == T_BYTE) { 2052 smov(dst, tmp, size, 0); 2053 addw(dst, src1, dst, ext::sxtb); 2054 } else if (bt == T_SHORT) { 2055 smov(dst, tmp, size, 0); 2056 addw(dst, src1, dst, ext::sxth); 2057 } else { 2058 umov(dst, tmp, size, 0); 2059 addw(dst, dst, src1); 2060 } 2061 break; 2062 } 2063 case Op_AddReductionVL: { 2064 sve_uaddv(tmp, size, pg, src2); 2065 umov(dst, tmp, size, 0); 2066 add(dst, dst, src1); 2067 break; 2068 } 2069 case Op_AndReductionV: { 2070 sve_andv(tmp, size, pg, src2); 2071 if (bt == T_INT || bt == T_LONG) { 2072 umov(dst, tmp, size, 0); 2073 } else { 2074 smov(dst, tmp, size, 0); 2075 } 2076 if (bt == T_LONG) { 2077 andr(dst, dst, src1); 2078 } else { 2079 andw(dst, dst, src1); 2080 } 2081 break; 2082 } 2083 case Op_OrReductionV: { 2084 sve_orv(tmp, size, pg, src2); 2085 if (bt == T_INT || bt == T_LONG) { 2086 umov(dst, tmp, size, 0); 2087 } else { 2088 smov(dst, tmp, size, 0); 2089 } 2090 if (bt == T_LONG) { 2091 orr(dst, dst, src1); 2092 } else { 2093 orrw(dst, dst, src1); 2094 } 2095 break; 2096 } 2097 case Op_XorReductionV: { 2098 sve_eorv(tmp, size, pg, src2); 2099 if (bt == T_INT || bt == T_LONG) { 2100 umov(dst, tmp, size, 0); 2101 } else { 2102 smov(dst, tmp, size, 0); 2103 } 2104 if (bt == T_LONG) { 2105 eor(dst, dst, src1); 2106 } else { 2107 eorw(dst, dst, src1); 2108 } 2109 break; 2110 } 2111 case Op_MaxReductionV: { 2112 sve_smaxv(tmp, size, pg, src2); 2113 if (bt == T_INT || bt == T_LONG) { 2114 umov(dst, tmp, size, 0); 2115 } else { 2116 smov(dst, tmp, size, 0); 2117 } 2118 if (bt == T_LONG) { 2119 cmp(dst, src1); 2120 csel(dst, dst, src1, Assembler::GT); 2121 } else { 2122 cmpw(dst, src1); 2123 cselw(dst, dst, src1, Assembler::GT); 2124 } 2125 break; 2126 } 2127 case Op_MinReductionV: { 2128 sve_sminv(tmp, size, pg, src2); 2129 if (bt == T_INT || bt == T_LONG) { 2130 umov(dst, tmp, size, 0); 2131 } else { 2132 smov(dst, tmp, size, 0); 2133 } 2134 if (bt == T_LONG) { 2135 cmp(dst, src1); 2136 csel(dst, dst, src1, Assembler::LT); 2137 } else { 2138 cmpw(dst, src1); 2139 cselw(dst, dst, src1, Assembler::LT); 2140 } 2141 break; 2142 } 2143 default: 2144 assert(false, "unsupported"); 2145 ShouldNotReachHere(); 2146 } 2147 2148 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2149 if (bt == T_BYTE) { 2150 sxtb(dst, dst); 2151 } else if (bt == T_SHORT) { 2152 
sxth(dst, dst); 2153 } 2154 } 2155 } 2156 2157 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2158 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2159 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 2160 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2161 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2162 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2163 2164 // Set all elements to false if the input "lane_cnt" is zero. 2165 if (lane_cnt == 0) { 2166 sve_pfalse(dst); 2167 return; 2168 } 2169 2170 SIMD_RegVariant size = elemType_to_regVariant(bt); 2171 assert(size != Q, "invalid size"); 2172 2173 // Set all true if "lane_cnt" equals to the max lane count. 2174 if (lane_cnt == max_vector_length) { 2175 sve_ptrue(dst, size, /* ALL */ 0b11111); 2176 return; 2177 } 2178 2179 // Fixed numbers for "ptrue". 2180 switch(lane_cnt) { 2181 case 1: /* VL1 */ 2182 case 2: /* VL2 */ 2183 case 3: /* VL3 */ 2184 case 4: /* VL4 */ 2185 case 5: /* VL5 */ 2186 case 6: /* VL6 */ 2187 case 7: /* VL7 */ 2188 case 8: /* VL8 */ 2189 sve_ptrue(dst, size, lane_cnt); 2190 return; 2191 case 16: 2192 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2193 return; 2194 case 32: 2195 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2196 return; 2197 case 64: 2198 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2199 return; 2200 case 128: 2201 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2202 return; 2203 case 256: 2204 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2205 return; 2206 default: 2207 break; 2208 } 2209 2210 // Special patterns for "ptrue". 2211 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2212 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2213 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2214 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2215 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2216 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2217 } else { 2218 // Encode to "whileltw" for the remaining cases. 2219 mov(rscratch1, lane_cnt); 2220 sve_whileltw(dst, size, zr, rscratch1); 2221 } 2222 } 2223 2224 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2225 // Any remaining elements of dst will be filled with zero. 2226 // Clobbers: rscratch1 2227 // Preserves: src, mask 2228 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2229 FloatRegister vtmp1, FloatRegister vtmp2, 2230 PRegister pgtmp) { 2231 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2232 assert_different_registers(dst, src, vtmp1, vtmp2); 2233 assert_different_registers(mask, pgtmp); 2234 2235 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 2236 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 2237 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 2238 sve_dup(vtmp2, H, 0); 2239 2240 // Extend lowest half to type INT. 2241 // dst = 00004444 00003333 00002222 00001111 2242 sve_uunpklo(dst, S, src); 2243 // pgtmp = 00000001 00000000 00000001 00000001 2244 sve_punpklo(pgtmp, mask); 2245 // Pack the active elements in size of type INT to the right, 2246 // and fill the remainings with zero. 2247 // dst = 00000000 00004444 00002222 00001111 2248 sve_compact(dst, S, dst, pgtmp); 2249 // Narrow the result back to type SHORT. 
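// (Editorial note, not part of the original sources: uzp1 with the all-zero vtmp2 as the
// second operand keeps the low 16 bits of every S lane and zero-fills the upper half of the
// result, which is what makes the final orr-combine below safe.)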
2250 // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2251 sve_uzp1(dst, H, dst, vtmp2);
2252 // Count the active elements of lowest half.
2253 // rscratch1 = 3
2254 sve_cntp(rscratch1, S, ptrue, pgtmp);
2255
2256 // Repeat to the highest half.
2257 // pgtmp = 00000001 00000000 00000000 00000001
2258 sve_punpkhi(pgtmp, mask);
2259 // vtmp1 = 00008888 00007777 00006666 00005555
2260 sve_uunpkhi(vtmp1, S, src);
2261 // vtmp1 = 00000000 00000000 00008888 00005555
2262 sve_compact(vtmp1, S, vtmp1, pgtmp);
2263 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2264 sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2265
2266 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
2267 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2268 // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
2269 // TRUE_CNT is the number of active elements in the compressed low.
2270 neg(rscratch1, rscratch1);
2271 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2272 sve_index(vtmp2, H, rscratch1, 1);
2273 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2274 sve_tbl(vtmp1, H, vtmp1, vtmp2);
2275
2276 // Combine the compressed high (after the shift) with the compressed low.
2277 // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2278 sve_orr(dst, dst, vtmp1);
2279 }
2280
2281 // Clobbers: rscratch1, rscratch2
2282 // Preserves: src, mask
2283 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2284 FloatRegister vtmp1, FloatRegister vtmp2,
2285 FloatRegister vtmp3, FloatRegister vtmp4,
2286 PRegister ptmp, PRegister pgtmp) {
2287 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2288 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2289 assert_different_registers(mask, ptmp, pgtmp);
2290 // Example input: src = 88 77 66 55 44 33 22 11
2291 // mask = 01 00 00 01 01 00 01 01
2292 // Expected result: dst = 00 00 00 88 55 44 22 11
2293
2294 sve_dup(vtmp4, B, 0);
2295 // Extend lowest half to type SHORT.
2296 // vtmp1 = 0044 0033 0022 0011
2297 sve_uunpklo(vtmp1, H, src);
2298 // ptmp = 0001 0000 0001 0001
2299 sve_punpklo(ptmp, mask);
2300 // Count the active elements of lowest half.
2301 // rscratch2 = 3
2302 sve_cntp(rscratch2, H, ptrue, ptmp);
2303 // Pack the active elements in size of type SHORT to the right,
2304 // and fill the remainings with zero.
2305 // dst = 0000 0044 0022 0011
2306 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2307 // Narrow the result back to type BYTE.
2308 // dst = 00 00 00 00 00 44 22 11
2309 sve_uzp1(dst, B, dst, vtmp4);
2310
2311 // Repeat to the highest half.
2312 // ptmp = 0001 0000 0000 0001
2313 sve_punpkhi(ptmp, mask);
2314 // vtmp2 = 0088 0077 0066 0055
2315 sve_uunpkhi(vtmp2, H, src);
2316 // vtmp1 = 0000 0000 0088 0055
2317 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2318
2319 sve_dup(vtmp4, B, 0);
2320 // vtmp1 = 00 00 00 00 00 00 88 55
2321 sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2322
2323 // Compressed low: dst = 00 00 00 00 00 44 22 11
2324 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
2325 // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
2326 // TRUE_CNT is the number of active elements in the compressed low.
2327 neg(rscratch2, rscratch2);
2328 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2329 sve_index(vtmp2, B, rscratch2, 1);
2330 // vtmp1 = 00 00 00 88 55 00 00 00
2331 sve_tbl(vtmp1, B, vtmp1, vtmp2);
2332 // Combine the compressed high (after the shift) with the compressed low.
2333 // dst = 00 00 00 88 55 44 22 11 2334 sve_orr(dst, dst, vtmp1); 2335 } 2336 2337 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2338 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2339 SIMD_Arrangement size = isQ ? T16B : T8B; 2340 if (bt == T_BYTE) { 2341 rbit(dst, size, src); 2342 } else { 2343 neon_reverse_bytes(dst, src, bt, isQ); 2344 rbit(dst, size, dst); 2345 } 2346 } 2347 2348 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2349 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2350 SIMD_Arrangement size = isQ ? T16B : T8B; 2351 switch (bt) { 2352 case T_BYTE: 2353 if (dst != src) { 2354 orr(dst, size, src, src); 2355 } 2356 break; 2357 case T_SHORT: 2358 rev16(dst, size, src); 2359 break; 2360 case T_INT: 2361 rev32(dst, size, src); 2362 break; 2363 case T_LONG: 2364 rev64(dst, size, src); 2365 break; 2366 default: 2367 assert(false, "unsupported"); 2368 ShouldNotReachHere(); 2369 } 2370 } 2371 2372 // VectorRearrange implementation for short/int/float/long/double types with NEON 2373 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction. 2374 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group. 2375 // For VectorRearrange long/double, we compare the shuffle input with iota indices, 2376 // and use bsl to implement the operation. 2377 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src, 2378 FloatRegister shuffle, FloatRegister tmp, 2379 BasicType bt, bool isQ) { 2380 assert_different_registers(dst, src, shuffle, tmp); 2381 SIMD_Arrangement size1 = isQ ? T16B : T8B; 2382 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); 2383 2384 // Here is an example that rearranges a NEON vector with 4 ints: 2385 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] 2386 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1]. 2387 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector 2388 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get 2389 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. 2390 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100], 2391 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] 2392 // 4. Use Vm as index register, and use V1 as table register. 2393 // Then get V2 as the result by tbl NEON instructions. 2394 switch (bt) { 2395 case T_SHORT: 2396 mov(tmp, size1, 0x02); 2397 mulv(dst, size2, shuffle, tmp); 2398 mov(tmp, size2, 0x0100); 2399 addv(dst, size1, dst, tmp); 2400 tbl(dst, size1, src, 1, dst); 2401 break; 2402 case T_INT: 2403 case T_FLOAT: 2404 mov(tmp, size1, 0x04); 2405 mulv(dst, size2, shuffle, tmp); 2406 mov(tmp, size2, 0x03020100); 2407 addv(dst, size1, dst, tmp); 2408 tbl(dst, size1, src, 1, dst); 2409 break; 2410 case T_LONG: 2411 case T_DOUBLE: 2412 // Load the iota indices for Long type. The indices are ordered by 2413 // type B/S/I/L/F/D, and the offset between two types is 16; Hence 2414 // the offset for L is 48. 2415 lea(rscratch1, 2416 ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48)); 2417 ldrq(tmp, rscratch1); 2418 // Check whether the input "shuffle" is the same with iota indices. 2419 // Return "src" if true, otherwise swap the two elements of "src". 
2420 cm(EQ, dst, size2, shuffle, tmp); 2421 ext(tmp, size1, src, src, 8); 2422 bsl(dst, size1, src, tmp); 2423 break; 2424 default: 2425 assert(false, "unsupported element type"); 2426 ShouldNotReachHere(); 2427 } 2428 } 2429 2430 // Extract a scalar element from an sve vector at position 'idx'. 2431 // The input elements in src are expected to be of integral type. 2432 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2433 int idx, FloatRegister vtmp) { 2434 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2435 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2436 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2437 if (bt == T_INT || bt == T_LONG) { 2438 umov(dst, src, size, idx); 2439 } else { 2440 smov(dst, src, size, idx); 2441 } 2442 } else { 2443 sve_orr(vtmp, src, src); 2444 sve_ext(vtmp, vtmp, idx << size); 2445 if (bt == T_INT || bt == T_LONG) { 2446 umov(dst, vtmp, size, 0); 2447 } else { 2448 smov(dst, vtmp, size, 0); 2449 } 2450 } 2451 } 2452 2453 // java.lang.Math::round intrinsics 2454 2455 // Clobbers: rscratch1, rflags 2456 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2457 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2458 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2459 switch (T) { 2460 case T2S: 2461 case T4S: 2462 fmovs(tmp1, T, 0.5f); 2463 mov(rscratch1, jint_cast(0x1.0p23f)); 2464 break; 2465 case T2D: 2466 fmovd(tmp1, T, 0.5); 2467 mov(rscratch1, julong_cast(0x1.0p52)); 2468 break; 2469 default: 2470 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2471 } 2472 fadd(tmp1, T, tmp1, src); 2473 fcvtms(tmp1, T, tmp1); 2474 // tmp1 = floor(src + 0.5, ties to even) 2475 2476 fcvtas(dst, T, src); 2477 // dst = round(src), ties to away 2478 2479 fneg(tmp3, T, src); 2480 dup(tmp2, T, rscratch1); 2481 cm(HS, tmp3, T, tmp3, tmp2); 2482 // tmp3 is now a set of flags 2483 2484 bif(dst, T16B, tmp1, tmp3); 2485 // result in dst 2486 } 2487 2488 // Clobbers: rscratch1, rflags 2489 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2490 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2491 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2492 assert_different_registers(tmp1, tmp2, src, dst); 2493 2494 switch (T) { 2495 case S: 2496 mov(rscratch1, jint_cast(0x1.0p23f)); 2497 break; 2498 case D: 2499 mov(rscratch1, julong_cast(0x1.0p52)); 2500 break; 2501 default: 2502 assert(T == S || T == D, "invalid register variant"); 2503 } 2504 2505 sve_frinta(dst, T, ptrue, src); 2506 // dst = round(src), ties to away 2507 2508 Label none; 2509 2510 sve_fneg(tmp1, T, ptrue, src); 2511 sve_dup(tmp2, T, rscratch1); 2512 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2513 br(EQ, none); 2514 { 2515 sve_cpy(tmp1, T, pgtmp, 0.5); 2516 sve_fadd(tmp1, T, pgtmp, src); 2517 sve_frintm(dst, T, pgtmp, tmp1); 2518 // dst = floor(src + 0.5, ties to even) 2519 } 2520 bind(none); 2521 2522 sve_fcvtzs(dst, T, ptrue, dst, T); 2523 // result in dst 2524 } 2525 2526 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2527 FloatRegister one, SIMD_Arrangement T) { 2528 assert_different_registers(dst, src, zero, one); 2529 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2530 2531 facgt(dst, T, src, zero); 2532 ushr(dst, T, 
dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise 2533 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst 2534 } 2535 2536 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero, 2537 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) { 2538 assert_different_registers(dst, src, zero, one, vtmp); 2539 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2540 2541 sve_orr(vtmp, src, src); 2542 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise 2543 switch (T) { 2544 case S: 2545 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src 2546 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending 2547 // on the sign of the float value 2548 break; 2549 case D: 2550 sve_and(vtmp, T, min_jlong); 2551 sve_orr(vtmp, T, jlong_cast(1.0)); 2552 break; 2553 default: 2554 assert(false, "unsupported"); 2555 ShouldNotReachHere(); 2556 } 2557 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp 2558 // Result in dst 2559 } 2560 2561 bool C2_MacroAssembler::in_scratch_emit_size() { 2562 if (ciEnv::current()->task() != nullptr) { 2563 PhaseOutput* phase_output = Compile::current()->output(); 2564 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2565 return true; 2566 } 2567 } 2568 return MacroAssembler::in_scratch_emit_size(); 2569 } 2570 2571 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) { 2572 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi); 2573 } 2574 2575 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) { 2576 assert(!t->empty() && !t->singleton(), "%s", Type::str(t)); 2577 if (t == TypeInt::INT) { 2578 return; 2579 } 2580 BLOCK_COMMENT("verify_int_in_range {"); 2581 Label L_success, L_failure; 2582 2583 jint lo = t->_lo; 2584 jint hi = t->_hi; 2585 2586 if (lo != min_jint && hi != max_jint) { 2587 subsw(rtmp, rval, lo); 2588 br(Assembler::LT, L_failure); 2589 subsw(rtmp, rval, hi); 2590 br(Assembler::LE, L_success); 2591 } else if (lo != min_jint) { 2592 subsw(rtmp, rval, lo); 2593 br(Assembler::GE, L_success); 2594 } else if (hi != max_jint) { 2595 subsw(rtmp, rval, hi); 2596 br(Assembler::LE, L_success); 2597 } else { 2598 ShouldNotReachHere(); 2599 } 2600 2601 bind(L_failure); 2602 movw(c_rarg0, idx); 2603 mov(c_rarg1, rval); 2604 movw(c_rarg2, lo); 2605 movw(c_rarg3, hi); 2606 reconstruct_frame_pointer(rtmp); 2607 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp); 2608 hlt(0); 2609 2610 bind(L_success); 2611 BLOCK_COMMENT("} verify_int_in_range"); 2612 } 2613 2614 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) { 2615 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi); 2616 } 2617 2618 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) { 2619 assert(!t->empty() && !t->singleton(), "%s", Type::str(t)); 2620 if (t == TypeLong::LONG) { 2621 return; 2622 } 2623 BLOCK_COMMENT("verify_long_in_range {"); 2624 Label L_success, L_failure; 2625 2626 jlong lo = t->_lo; 2627 jlong hi = t->_hi; 2628 2629 if (lo != min_jlong && hi != max_jlong) { 2630 subs(rtmp, rval, lo); 2631 br(Assembler::LT, L_failure); 2632 subs(rtmp, rval, hi); 
2633 br(Assembler::LE, L_success); 2634 } else if (lo != min_jlong) { 2635 subs(rtmp, rval, lo); 2636 br(Assembler::GE, L_success); 2637 } else if (hi != max_jlong) { 2638 subs(rtmp, rval, hi); 2639 br(Assembler::LE, L_success); 2640 } else { 2641 ShouldNotReachHere(); 2642 } 2643 2644 bind(L_failure); 2645 movw(c_rarg0, idx); 2646 mov(c_rarg1, rval); 2647 mov(c_rarg2, lo); 2648 mov(c_rarg3, hi); 2649 reconstruct_frame_pointer(rtmp); 2650 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp); 2651 hlt(0); 2652 2653 bind(L_success); 2654 BLOCK_COMMENT("} verify_long_in_range"); 2655 } 2656 2657 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 2658 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 2659 if (PreserveFramePointer) { 2660 // frame pointer is valid 2661 #ifdef ASSERT 2662 // Verify frame pointer value in rfp. 2663 add(rtmp, sp, framesize - 2 * wordSize); 2664 Label L_success; 2665 cmp(rfp, rtmp); 2666 br(Assembler::EQ, L_success); 2667 stop("frame pointer mismatch"); 2668 bind(L_success); 2669 #endif // ASSERT 2670 } else { 2671 add(rfp, sp, framesize - 2 * wordSize); 2672 } 2673 } 2674 2675 // Selects elements from two source vectors (src1, src2) based on index values in the index register 2676 // using Neon instructions and places it in the destination vector element corresponding to the 2677 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM), 2678 // where NUM_ELEM is the number of BasicType elements per vector. 2679 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register) 2680 // Otherwise, selects src2[idx – NUM_ELEM] 2681 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1, 2682 FloatRegister src2, FloatRegister index, 2683 FloatRegister tmp, unsigned vector_length_in_bytes) { 2684 assert_different_registers(dst, src1, src2, tmp); 2685 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B; 2686 2687 if (vector_length_in_bytes == 16) { 2688 assert(UseSVE <= 1, "sve must be <= 1"); 2689 assert(src1->successor() == src2, "Source registers must be ordered"); 2690 // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table 2691 tbl(dst, size, src1, 2, index); 2692 } else { // vector length == 8 2693 assert(UseSVE == 0, "must be Neon only"); 2694 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the 2695 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl" 2696 // instruction with one vector lookup 2697 ins(tmp, D, src1, 0, 0); 2698 ins(tmp, D, src2, 1, 0); 2699 tbl(dst, size, tmp, 1, index); 2700 } 2701 } 2702 2703 // Selects elements from two source vectors (src1, src2) based on index values in the index register 2704 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the 2705 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM), 2706 // where NUM_ELEM is the number of BasicType elements per vector. 
2707 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register) 2708 // Otherwise, selects src2[idx – NUM_ELEM] 2709 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1, 2710 FloatRegister src2, FloatRegister index, 2711 FloatRegister tmp, SIMD_RegVariant T, 2712 unsigned vector_length_in_bytes) { 2713 assert_different_registers(dst, src1, src2, index, tmp); 2714 2715 if (vector_length_in_bytes == 8) { 2716 // We need to fit both the source vectors (src1, src2) in a single vector register because the 2717 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to 2718 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl" 2719 // instruction with one vector lookup 2720 assert(UseSVE >= 1, "sve must be >= 1"); 2721 ins(tmp, D, src1, 0, 0); 2722 ins(tmp, D, src2, 1, 0); 2723 sve_tbl(dst, T, tmp, index); 2724 } else { // UseSVE == 2 and vector_length_in_bytes > 8 2725 // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table. 2726 // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation 2727 // is not executed on machines where vector_length_in_bytes < MaxVectorSize 2728 // with the only exception of 8B vector length. 2729 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be"); 2730 assert(src1->successor() == src2, "Source registers must be ordered"); 2731 sve_tbl(dst, T, src1, src2, index); 2732 } 2733 } 2734 2735 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1, 2736 FloatRegister src2, FloatRegister index, 2737 FloatRegister tmp, BasicType bt, 2738 unsigned vector_length_in_bytes) { 2739 2740 assert_different_registers(dst, src1, src2, index, tmp); 2741 2742 // The cases that can reach this method are - 2743 // - UseSVE = 0, vector_length_in_bytes = 8 or 16 2744 // - UseSVE = 1, vector_length_in_bytes = 8 or 16 2745 // - UseSVE = 2, vector_length_in_bytes >= 8 2746 // 2747 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8 2748 // and UseSVE = 2 with vector_length_in_bytes >= 8 2749 // 2750 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and 2751 // UseSVE = 1 with vector_length_in_bytes = 16 2752 2753 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) { 2754 SIMD_RegVariant T = elemType_to_regVariant(bt); 2755 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes); 2756 return; 2757 } 2758 2759 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT 2760 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type"); 2761 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16"); 2762 2763 bool isQ = vector_length_in_bytes == 16; 2764 2765 SIMD_Arrangement size1 = isQ ? T16B : T8B; 2766 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); 2767 2768 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of 2769 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table. 2770 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM 2771 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length, 2772 // the indices can range from [0, 8). 
2773 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0] 2774 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202] 2775 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000] 2776 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100] 2777 // Add the multiplied result to the vector in tmp to obtain the byte level 2778 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100] 2779 // Use these offsets in the "tbl" instruction to select chunks of 2B. 2780 2781 if (bt == T_BYTE) { 2782 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes); 2783 } else { 2784 int elem_size = (bt == T_SHORT) ? 2 : 4; 2785 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u; 2786 2787 mov(tmp, size1, elem_size); 2788 mulv(dst, size2, index, tmp); 2789 mov(tmp, size2, tbl_offset); 2790 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements 2791 // to select a set of 2B/4B 2792 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes); 2793 } 2794 }
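// Editorial example (not part of the original sources), mirroring the T_SHORT walk-through
// above for T_INT with a 128-bit vector: let index = [5, 0, 2, 1]
//   tmp = [0x04040404, ...]; dst = index * tmp = [0x14141414, 0x00000000, 0x08080808, 0x04040404]
//   tmp = [0x03020100, ...]; dst = dst + tmp   = [0x17161514, 0x03020100, 0x0b0a0908, 0x07060504]
// These byte offsets are then used by the two-register tbl, so offsets >= 16 (here, lane 0)
// select 4-byte chunks from src2 rather than src1.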