/*
 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// jdk.internal.util.ArraysSupport.vectorizedHashCode
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs.
  // We use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible
  // to use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT ? 4
                    : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
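  // Roughly speaking, the scalar path computes the usual polynomial hash,
  // result = 31 * result + a[i] for each element (31 is kept in tmp2), and the
  // computed branch ahead of BR_BASE enters the unrolled body part-way in so
  // that only cnt % unroll_factor load/madd pairs run on the first pass; e.g.
  // with cnt == 7 the partial pass consumes 3 elements and one full loop
  // iteration then consumes the remaining 4.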
73 const size_t unroll_factor = 4; 74 75 switch (eltype) { 76 case T_BOOLEAN: 77 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); 78 break; 79 case T_CHAR: 80 BLOCK_COMMENT("arrays_hashcode(char) {"); 81 break; 82 case T_BYTE: 83 BLOCK_COMMENT("arrays_hashcode(byte) {"); 84 break; 85 case T_SHORT: 86 BLOCK_COMMENT("arrays_hashcode(short) {"); 87 break; 88 case T_INT: 89 BLOCK_COMMENT("arrays_hashcode(int) {"); 90 break; 91 default: 92 ShouldNotReachHere(); 93 } 94 95 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop 96 // implemented by the stub executes just once. Call the stub only if at least two iterations will 97 // be executed. 98 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf; 99 cmpw(cnt, large_threshold); 100 br(Assembler::HS, LARGE); 101 102 bind(TAIL); 103 104 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past 105 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs. 106 // Iteration eats up the remainder, uf elements at a time. 107 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC"); 108 andr(tmp2, cnt, unroll_factor - 1); 109 adr(tmp1, BR_BASE); 110 // For Cortex-A53 offset is 4 because 2 nops are generated. 111 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3); 112 movw(tmp2, 0x1f); 113 br(tmp1); 114 115 bind(LOOP); 116 for (size_t i = 0; i < unroll_factor; ++i) { 117 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype); 118 maddw(result, result, tmp2, tmp1); 119 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 120 // Generate 2nd nop to have 4 instructions per iteration. 121 if (VM_Version::supports_a53mac()) { 122 nop(); 123 } 124 } 125 bind(BR_BASE); 126 subsw(cnt, cnt, unroll_factor); 127 br(Assembler::HS, LOOP); 128 129 b(DONE); 130 131 bind(LARGE); 132 133 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype)); 134 assert(stub.target() != nullptr, "array_hashcode stub has not been generated"); 135 address tpc = trampoline_call(stub); 136 if (tpc == nullptr) { 137 DEBUG_ONLY(reset_labels(TAIL, BR_BASE)); 138 postcond(pc() == badAddress); 139 return nullptr; 140 } 141 142 bind(DONE); 143 144 BLOCK_COMMENT("} // arrays_hashcode"); 145 146 postcond(pc() != badAddress); 147 return pc(); 148 } 149 150 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1, 151 Register t2, Register t3) { 152 assert_different_registers(obj, box, t1, t2, t3, rscratch2); 153 154 // Handle inflated monitor. 155 Label inflated; 156 // Finish fast lock successfully. MUST branch to with flag == EQ 157 Label locked; 158 // Finish fast lock unsuccessfully. MUST branch to with flag == NE 159 Label slow_path; 160 161 if (UseObjectMonitorTable) { 162 // Clear cache in case fast locking succeeds or we need to take the slow-path. 163 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 164 } 165 166 if (DiagnoseSyncOnValueBasedClasses != 0) { 167 load_klass(t1, obj); 168 ldrb(t1, Address(t1, Klass::misc_flags_offset())); 169 tst(t1, KlassFlags::_misc_is_value_based_class); 170 br(Assembler::NE, slow_path); 171 } 172 173 const Register t1_mark = t1; 174 const Register t3_t = t3; 175 176 { // Lightweight locking 177 178 // Push lock to the lock stack and finish successfully. 
MUST branch to with flag == EQ 179 Label push; 180 181 const Register t2_top = t2; 182 183 // Check if lock-stack is full. 184 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 185 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1); 186 br(Assembler::GT, slow_path); 187 188 // Check if recursive. 189 subw(t3_t, t2_top, oopSize); 190 ldr(t3_t, Address(rthread, t3_t)); 191 cmp(obj, t3_t); 192 br(Assembler::EQ, push); 193 194 // Relaxed normal load to check for monitor. Optimization for monitor case. 195 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 196 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 197 198 // Not inflated 199 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 200 201 // Try to lock. Transition lock-bits 0b01 => 0b00 202 orr(t1_mark, t1_mark, markWord::unlocked_value); 203 eor(t3_t, t1_mark, markWord::unlocked_value); 204 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 205 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 206 br(Assembler::NE, slow_path); 207 208 bind(push); 209 // After successful lock, push object on lock-stack. 210 str(obj, Address(rthread, t2_top)); 211 addw(t2_top, t2_top, oopSize); 212 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 213 b(locked); 214 } 215 216 { // Handle inflated monitor. 217 bind(inflated); 218 219 const Register t1_monitor = t1; 220 221 if (!UseObjectMonitorTable) { 222 assert(t1_monitor == t1_mark, "should be the same here"); 223 } else { 224 Label monitor_found; 225 226 // Load cache address 227 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset())); 228 229 const int num_unrolled = 2; 230 for (int i = 0; i < num_unrolled; i++) { 231 ldr(t1, Address(t3_t)); 232 cmp(obj, t1); 233 br(Assembler::EQ, monitor_found); 234 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 235 } 236 237 Label loop; 238 239 // Search for obj in cache. 240 bind(loop); 241 242 // Check for match. 243 ldr(t1, Address(t3_t)); 244 cmp(obj, t1); 245 br(Assembler::EQ, monitor_found); 246 247 // Search until null encountered, guaranteed _null_sentinel at end. 248 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 249 cbnz(t1, loop); 250 // Cache Miss, NE set from cmp above, cbnz does not set flags 251 b(slow_path); 252 253 bind(monitor_found); 254 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference())); 255 } 256 257 const Register t2_owner_addr = t2; 258 const Register t3_owner = t3; 259 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 260 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag); 261 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag); 262 263 Label monitor_locked; 264 265 // Compute owner address. 266 lea(t2_owner_addr, owner_address); 267 268 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 269 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset())); 270 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true, 271 /*release*/ false, /*weak*/ false, t3_owner); 272 br(Assembler::EQ, monitor_locked); 273 274 // Check if recursive. 275 cmp(t3_owner, rscratch2); 276 br(Assembler::NE, slow_path); 277 278 // Recursive. 
279 increment(recursions_address, 1); 280 281 bind(monitor_locked); 282 if (UseObjectMonitorTable) { 283 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 284 } 285 } 286 287 bind(locked); 288 289 #ifdef ASSERT 290 // Check that locked label is reached with Flags == EQ. 291 Label flag_correct; 292 br(Assembler::EQ, flag_correct); 293 stop("Fast Lock Flag != EQ"); 294 #endif 295 296 bind(slow_path); 297 #ifdef ASSERT 298 // Check that slow_path label is reached with Flags == NE. 299 br(Assembler::NE, flag_correct); 300 stop("Fast Lock Flag != NE"); 301 bind(flag_correct); 302 #endif 303 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 304 } 305 306 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1, 307 Register t2, Register t3) { 308 assert_different_registers(obj, box, t1, t2, t3); 309 310 // Handle inflated monitor. 311 Label inflated, inflated_load_mark; 312 // Finish fast unlock successfully. MUST branch to with flag == EQ 313 Label unlocked; 314 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 315 Label slow_path; 316 317 const Register t1_mark = t1; 318 const Register t2_top = t2; 319 const Register t3_t = t3; 320 321 { // Lightweight unlock 322 323 Label push_and_slow_path; 324 325 // Check if obj is top of lock-stack. 326 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 327 subw(t2_top, t2_top, oopSize); 328 ldr(t3_t, Address(rthread, t2_top)); 329 cmp(obj, t3_t); 330 // Top of lock stack was not obj. Must be monitor. 331 br(Assembler::NE, inflated_load_mark); 332 333 // Pop lock-stack. 334 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 335 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 336 337 // Check if recursive. 338 subw(t3_t, t2_top, oopSize); 339 ldr(t3_t, Address(rthread, t3_t)); 340 cmp(obj, t3_t); 341 br(Assembler::EQ, unlocked); 342 343 // Not recursive. 344 // Load Mark. 345 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 346 347 // Check header for monitor (0b10). 348 // Because we got here by popping (meaning we pushed in locked) 349 // there will be no monitor in the box. So we need to push back the obj 350 // so that the runtime can fix any potential anonymous owner. 351 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated); 352 353 // Try to unlock. Transition lock bits 0b00 => 0b01 354 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 355 orr(t3_t, t1_mark, markWord::unlocked_value); 356 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 357 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 358 br(Assembler::EQ, unlocked); 359 360 bind(push_and_slow_path); 361 // Compare and exchange failed. 362 // Restore lock-stack and handle the unlock in runtime. 363 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 364 addw(t2_top, t2_top, oopSize); 365 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 366 b(slow_path); 367 } 368 369 370 { // Handle inflated monitor. 
371 bind(inflated_load_mark); 372 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 373 #ifdef ASSERT 374 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 375 stop("Fast Unlock not monitor"); 376 #endif 377 378 bind(inflated); 379 380 #ifdef ASSERT 381 Label check_done; 382 subw(t2_top, t2_top, oopSize); 383 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 384 br(Assembler::LT, check_done); 385 ldr(t3_t, Address(rthread, t2_top)); 386 cmp(obj, t3_t); 387 br(Assembler::NE, inflated); 388 stop("Fast Unlock lock on stack"); 389 bind(check_done); 390 #endif 391 392 const Register t1_monitor = t1; 393 394 if (!UseObjectMonitorTable) { 395 assert(t1_monitor == t1_mark, "should be the same here"); 396 397 // Untag the monitor. 398 add(t1_monitor, t1_mark, -(int)markWord::monitor_value); 399 } else { 400 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 401 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 402 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 403 br(Assembler::LO, slow_path); 404 } 405 406 const Register t2_recursions = t2; 407 Label not_recursive; 408 409 // Check if recursive. 410 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 411 cbz(t2_recursions, not_recursive); 412 413 // Recursive unlock. 414 sub(t2_recursions, t2_recursions, 1u); 415 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 416 // Set flag == EQ 417 cmp(t2_recursions, t2_recursions); 418 b(unlocked); 419 420 bind(not_recursive); 421 422 const Register t2_owner_addr = t2; 423 424 // Compute owner address. 425 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 426 427 // Set owner to null. 428 // Release to satisfy the JMM 429 stlr(zr, t2_owner_addr); 430 // We need a full fence after clearing owner to avoid stranding. 431 // StoreLoad achieves this. 432 membar(StoreLoad); 433 434 // Check if the entry_list is empty. 435 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset())); 436 cmp(rscratch1, zr); 437 br(Assembler::EQ, unlocked); // If so we are done. 438 439 // Check if there is a successor. 440 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset())); 441 cmp(rscratch1, zr); 442 br(Assembler::NE, unlocked); // If so we are done. 443 444 // Save the monitor pointer in the current thread, so we can try to 445 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 446 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 447 448 cmp(zr, rthread); // Set Flag to NE => slow path 449 b(slow_path); 450 } 451 452 bind(unlocked); 453 cmp(zr, zr); // Set Flags to EQ => fast path 454 455 #ifdef ASSERT 456 // Check that unlocked label is reached with Flags == EQ. 457 Label flag_correct; 458 br(Assembler::EQ, flag_correct); 459 stop("Fast Unlock Flag != EQ"); 460 #endif 461 462 bind(slow_path); 463 #ifdef ASSERT 464 // Check that slow_path label is reached with Flags == NE. 465 br(Assembler::NE, flag_correct); 466 stop("Fast Unlock Flag != NE"); 467 bind(flag_correct); 468 #endif 469 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 470 } 471 472 // Search for str1 in str2 and return index or -1 473 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 
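// Informally, on the arguments as used below: str1/cnt1 hold the pattern and
// str2/cnt2 the source; ae encodes the Latin1/UTF-16 combination
// (StrIntrinsicNode::LL, UU, UL or LU); icnt1 == -1 selects the generic path,
// while icnt1 of 1, 2, 3 or 4 selects the specialised paths for small constant
// pattern lengths.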
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer-Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few Java-specific optimizations.
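  // As an illustration of the bad-character table built below: for the pattern
  // "abc" (m == 3) the preprocessing loop leaves bc['a'] == 2 and bc['b'] == 1,
  // while every other entry keeps the default m == 3, so a mismatching source
  // character that does not occur in the pattern shifts the window by the full
  // pattern length.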
545 // 546 // #define ASIZE 256 547 // 548 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 549 // int i, j; 550 // unsigned c; 551 // unsigned char bc[ASIZE]; 552 // 553 // /* Preprocessing */ 554 // for (i = 0; i < ASIZE; ++i) 555 // bc[i] = m; 556 // for (i = 0; i < m - 1; ) { 557 // c = x[i]; 558 // ++i; 559 // // c < 256 for Latin1 string, so, no need for branch 560 // #ifdef PATTERN_STRING_IS_LATIN1 561 // bc[c] = m - i; 562 // #else 563 // if (c < ASIZE) bc[c] = m - i; 564 // #endif 565 // } 566 // 567 // /* Searching */ 568 // j = 0; 569 // while (j <= n - m) { 570 // c = y[i+j]; 571 // if (x[m-1] == c) 572 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 573 // if (i < 0) return j; 574 // // c < 256 for Latin1 string, so, no need for branch 575 // #ifdef SOURCE_STRING_IS_LATIN1 576 // // LL case: (c< 256) always true. Remove branch 577 // j += bc[y[j+m-1]]; 578 // #endif 579 // #ifndef PATTERN_STRING_IS_UTF 580 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 581 // if (c < ASIZE) 582 // j += bc[y[j+m-1]]; 583 // else 584 // j += 1 585 // #endif 586 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 587 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 588 // if (c < ASIZE) 589 // j += bc[y[j+m-1]]; 590 // else 591 // j += m 592 // #endif 593 // } 594 // } 595 596 if (icnt1 == -1) { 597 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 598 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 599 Register cnt1end = tmp2; 600 Register str2end = cnt2; 601 Register skipch = tmp2; 602 603 // str1 length is >=8, so, we can read at least 1 register for cases when 604 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 605 // UL case. We'll re-read last character in inner pre-loop code to have 606 // single outer pre-loop load 607 const int firstStep = isL ? 7 : 3; 608 609 const int ASIZE = 256; 610 const int STORED_BYTES = 32; // amount of bytes stored per instruction 611 sub(sp, sp, ASIZE); 612 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 613 mov(ch1, sp); 614 BIND(BM_INIT_LOOP); 615 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 616 subs(tmp5, tmp5, 1); 617 br(GT, BM_INIT_LOOP); 618 619 sub(cnt1tmp, cnt1, 1); 620 mov(tmp5, str2); 621 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 622 sub(ch2, cnt1, 1); 623 mov(tmp3, str1); 624 BIND(BCLOOP); 625 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 626 if (!str1_isL) { 627 subs(zr, ch1, ASIZE); 628 br(HS, BCSKIP); 629 } 630 strb(ch2, Address(sp, ch1)); 631 BIND(BCSKIP); 632 subs(ch2, ch2, 1); 633 br(GT, BCLOOP); 634 635 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 636 if (str1_isL == str2_isL) { 637 // load last 8 bytes (8LL/4UU symbols) 638 ldr(tmp6, Address(tmp6, -wordSize)); 639 } else { 640 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 641 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 642 // it's still faster than per-character loads+checks 643 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 644 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 645 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 646 andr(tmp6, tmp6, 0xFF); // str1[N-4] 647 orr(ch2, ch1, ch2, LSL, 16); 648 orr(tmp6, tmp6, tmp3, LSL, 48); 649 orr(tmp6, tmp6, ch2, LSL, 16); 650 } 651 BIND(BMLOOPSTR2); 652 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 653 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 654 if (str1_isL == str2_isL) { 655 // re-init tmp3. It's for free because it's executed in parallel with 656 // load above. Alternative is to initialize it before loop, but it'll 657 // affect performance on in-order systems with 2 or more ld/st pipelines 658 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 659 } 660 if (!isL) { // UU/UL case 661 lsl(ch2, cnt1tmp, 1); // offset in bytes 662 } 663 cmp(tmp3, skipch); 664 br(NE, BMSKIP); 665 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 666 mov(ch1, tmp6); 667 if (isL) { 668 b(BMLOOPSTR1_AFTER_LOAD); 669 } else { 670 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 671 b(BMLOOPSTR1_CMP); 672 } 673 BIND(BMLOOPSTR1); 674 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 675 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 676 BIND(BMLOOPSTR1_AFTER_LOAD); 677 subs(cnt1tmp, cnt1tmp, 1); 678 br(LT, BMLOOPSTR1_LASTCMP); 679 BIND(BMLOOPSTR1_CMP); 680 cmp(ch1, ch2); 681 br(EQ, BMLOOPSTR1); 682 BIND(BMSKIP); 683 if (!isL) { 684 // if we've met UTF symbol while searching Latin1 pattern, then we can 685 // skip cnt1 symbols 686 if (str1_isL != str2_isL) { 687 mov(result_tmp, cnt1); 688 } else { 689 mov(result_tmp, 1); 690 } 691 subs(zr, skipch, ASIZE); 692 br(HS, BMADV); 693 } 694 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 695 BIND(BMADV); 696 sub(cnt1tmp, cnt1, 1); 697 add(str2, str2, result_tmp, LSL, str2_chr_shift); 698 cmp(str2, str2end); 699 br(LE, BMLOOPSTR2); 700 add(sp, sp, ASIZE); 701 b(NOMATCH); 702 BIND(BMLOOPSTR1_LASTCMP); 703 cmp(ch1, ch2); 704 br(NE, BMSKIP); 705 BIND(BMMATCH); 706 sub(result, str2, tmp5); 707 if (!str2_isL) lsr(result, result, 1); 708 add(sp, sp, ASIZE); 709 b(DONE); 710 711 BIND(LINEARSTUB); 712 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 713 br(LT, LINEAR_MEDIUM); 714 mov(result, zr); 715 RuntimeAddress stub = nullptr; 716 if (isL) { 717 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 718 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 719 } else if (str1_isL) { 720 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 721 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 722 } else { 723 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 724 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 725 } 726 address call = trampoline_call(stub); 727 if (call == nullptr) { 728 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 729 ciEnv::current()->record_failure("CodeCache is full"); 730 return; 731 } 732 b(DONE); 733 } 734 735 BIND(LINEARSEARCH); 736 { 737 Label DO1, DO2, DO3; 738 739 Register str2tmp = tmp2; 740 Register first = tmp3; 741 742 if (icnt1 == 
-1) 743 { 744 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 745 746 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 747 br(LT, DOSHORT); 748 BIND(LINEAR_MEDIUM); 749 (this->*str1_load_1chr)(first, Address(str1)); 750 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 751 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 752 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 753 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 754 755 BIND(FIRST_LOOP); 756 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 757 cmp(first, ch2); 758 br(EQ, STR1_LOOP); 759 BIND(STR2_NEXT); 760 adds(cnt2_neg, cnt2_neg, str2_chr_size); 761 br(LE, FIRST_LOOP); 762 b(NOMATCH); 763 764 BIND(STR1_LOOP); 765 adds(cnt1tmp, cnt1_neg, str1_chr_size); 766 add(cnt2tmp, cnt2_neg, str2_chr_size); 767 br(GE, MATCH); 768 769 BIND(STR1_NEXT); 770 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 771 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 772 cmp(ch1, ch2); 773 br(NE, STR2_NEXT); 774 adds(cnt1tmp, cnt1tmp, str1_chr_size); 775 add(cnt2tmp, cnt2tmp, str2_chr_size); 776 br(LT, STR1_NEXT); 777 b(MATCH); 778 779 BIND(DOSHORT); 780 if (str1_isL == str2_isL) { 781 cmp(cnt1, (u1)2); 782 br(LT, DO1); 783 br(GT, DO3); 784 } 785 } 786 787 if (icnt1 == 4) { 788 Label CH1_LOOP; 789 790 (this->*load_4chr)(ch1, str1); 791 sub(result_tmp, cnt2, 4); 792 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 793 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 794 795 BIND(CH1_LOOP); 796 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 797 cmp(ch1, ch2); 798 br(EQ, MATCH); 799 adds(cnt2_neg, cnt2_neg, str2_chr_size); 800 br(LE, CH1_LOOP); 801 b(NOMATCH); 802 } 803 804 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 805 Label CH1_LOOP; 806 807 BIND(DO2); 808 (this->*load_2chr)(ch1, str1); 809 if (icnt1 == 2) { 810 sub(result_tmp, cnt2, 2); 811 } 812 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 813 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 814 BIND(CH1_LOOP); 815 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 816 cmp(ch1, ch2); 817 br(EQ, MATCH); 818 adds(cnt2_neg, cnt2_neg, str2_chr_size); 819 br(LE, CH1_LOOP); 820 b(NOMATCH); 821 } 822 823 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 824 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 825 826 BIND(DO3); 827 (this->*load_2chr)(first, str1); 828 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 829 if (icnt1 == 3) { 830 sub(result_tmp, cnt2, 3); 831 } 832 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 833 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 834 BIND(FIRST_LOOP); 835 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 836 cmpw(first, ch2); 837 br(EQ, STR1_LOOP); 838 BIND(STR2_NEXT); 839 adds(cnt2_neg, cnt2_neg, str2_chr_size); 840 br(LE, FIRST_LOOP); 841 b(NOMATCH); 842 843 BIND(STR1_LOOP); 844 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 845 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 846 cmp(ch1, ch2); 847 br(NE, STR2_NEXT); 848 b(MATCH); 849 } 850 851 if (icnt1 == -1 || icnt1 == 1) { 852 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 853 854 BIND(DO1); 855 (this->*str1_load_1chr)(ch1, str1); 856 cmp(cnt2, (u1)8); 857 br(LT, DO1_SHORT); 858 859 sub(result_tmp, cnt2, 8/str2_chr_size); 860 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 861 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 862 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 863 864 if (str2_isL) { 865 orr(ch1, ch1, ch1, LSL, 8); 866 } 867 orr(ch1, ch1, ch1, LSL, 16); 868 orr(ch1, ch1, ch1, LSL, 32); 869 BIND(CH1_LOOP); 870 ldr(ch2, Address(str2, cnt2_neg)); 871 eor(ch2, ch1, ch2); 872 sub(tmp1, ch2, tmp3); 873 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 874 bics(tmp1, tmp1, tmp2); 875 br(NE, HAS_ZERO); 876 adds(cnt2_neg, cnt2_neg, 8); 877 br(LT, CH1_LOOP); 878 879 cmp(cnt2_neg, (u1)8); 880 mov(cnt2_neg, 0); 881 br(LT, CH1_LOOP); 882 b(NOMATCH); 883 884 BIND(HAS_ZERO); 885 rev(tmp1, tmp1); 886 clz(tmp1, tmp1); 887 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 888 b(MATCH); 889 890 BIND(DO1_SHORT); 891 mov(result_tmp, cnt2); 892 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 893 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 894 BIND(DO1_LOOP); 895 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 896 cmpw(ch1, ch2); 897 br(EQ, MATCH); 898 adds(cnt2_neg, cnt2_neg, str2_chr_size); 899 br(LT, DO1_LOOP); 900 } 901 } 902 BIND(NOMATCH); 903 mov(result, -1); 904 b(DONE); 905 BIND(MATCH); 906 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 907 BIND(DONE); 908 } 909 910 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 911 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 912 913 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 914 Register ch, Register result, 915 Register tmp1, Register tmp2, Register tmp3) 916 { 917 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 918 Register cnt1_neg = cnt1; 919 Register ch1 = rscratch1; 920 Register result_tmp = rscratch2; 921 922 cbz(cnt1, NOMATCH); 923 924 cmp(cnt1, (u1)4); 925 br(LT, DO1_SHORT); 926 927 orr(ch, ch, ch, LSL, 16); 928 orr(ch, ch, ch, LSL, 32); 929 930 sub(cnt1, cnt1, 4); 931 mov(result_tmp, cnt1); 932 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 933 sub(cnt1_neg, zr, cnt1, LSL, 1); 934 935 mov(tmp3, 0x0001000100010001); 936 937 BIND(CH1_LOOP); 938 ldr(ch1, Address(str1, cnt1_neg)); 939 eor(ch1, ch, ch1); 940 sub(tmp1, ch1, tmp3); 941 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 942 bics(tmp1, tmp1, tmp2); 943 br(NE, HAS_ZERO); 944 adds(cnt1_neg, cnt1_neg, 8); 945 br(LT, CH1_LOOP); 946 947 cmp(cnt1_neg, (u1)8); 948 mov(cnt1_neg, 0); 949 br(LT, CH1_LOOP); 950 b(NOMATCH); 951 952 BIND(HAS_ZERO); 953 rev(tmp1, tmp1); 954 clz(tmp1, tmp1); 955 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 956 b(MATCH); 957 958 BIND(DO1_SHORT); 959 mov(result_tmp, cnt1); 960 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 961 sub(cnt1_neg, zr, cnt1, LSL, 1); 962 BIND(DO1_LOOP); 963 ldrh(ch1, Address(str1, cnt1_neg)); 964 cmpw(ch, ch1); 965 br(EQ, MATCH); 966 adds(cnt1_neg, cnt1_neg, 2); 967 br(LT, DO1_LOOP); 968 BIND(NOMATCH); 969 mov(result, -1); 970 b(DONE); 971 BIND(MATCH); 972 add(result, result_tmp, cnt1_neg, ASR, 1); 973 BIND(DONE); 974 } 975 976 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 977 Register ch, Register result, 978 FloatRegister ztmp1, 979 FloatRegister ztmp2, 980 PRegister tmp_pg, 981 PRegister tmp_pdn, bool isL) 982 { 983 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 984 assert(tmp_pg->is_governing(), 985 "this register has to be a governing predicate register"); 986 987 Label LOOP, MATCH, DONE, NOMATCH; 988 Register vec_len = rscratch1; 989 Register idx = rscratch2; 990 991 SIMD_RegVariant T = (isL == true) ? 
B : H; 992 993 cbz(cnt1, NOMATCH); 994 995 // Assign the particular char throughout the vector. 996 sve_dup(ztmp2, T, ch); 997 if (isL) { 998 sve_cntb(vec_len); 999 } else { 1000 sve_cnth(vec_len); 1001 } 1002 mov(idx, 0); 1003 1004 // Generate a predicate to control the reading of input string. 1005 sve_whilelt(tmp_pg, T, idx, cnt1); 1006 1007 BIND(LOOP); 1008 // Read a vector of 8- or 16-bit data depending on the string type. Note 1009 // that inactive elements indicated by the predicate register won't cause 1010 // a data read from memory to the destination vector. 1011 if (isL) { 1012 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1013 } else { 1014 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1015 } 1016 add(idx, idx, vec_len); 1017 1018 // Perform the comparison. An element of the destination predicate is set 1019 // to active if the particular char is matched. 1020 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1021 1022 // Branch if the particular char is found. 1023 br(NE, MATCH); 1024 1025 sve_whilelt(tmp_pg, T, idx, cnt1); 1026 1027 // Loop back if the particular char not found. 1028 br(MI, LOOP); 1029 1030 BIND(NOMATCH); 1031 mov(result, -1); 1032 b(DONE); 1033 1034 BIND(MATCH); 1035 // Undo the index increment. 1036 sub(idx, idx, vec_len); 1037 1038 // Crop the vector to find its location. 1039 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1040 add(result, idx, -1); 1041 sve_incp(result, T, tmp_pdn); 1042 BIND(DONE); 1043 } 1044 1045 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1046 Register ch, Register result, 1047 Register tmp1, Register tmp2, Register tmp3) 1048 { 1049 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1050 Register cnt1_neg = cnt1; 1051 Register ch1 = rscratch1; 1052 Register result_tmp = rscratch2; 1053 1054 cbz(cnt1, NOMATCH); 1055 1056 cmp(cnt1, (u1)8); 1057 br(LT, DO1_SHORT); 1058 1059 orr(ch, ch, ch, LSL, 8); 1060 orr(ch, ch, ch, LSL, 16); 1061 orr(ch, ch, ch, LSL, 32); 1062 1063 sub(cnt1, cnt1, 8); 1064 mov(result_tmp, cnt1); 1065 lea(str1, Address(str1, cnt1)); 1066 sub(cnt1_neg, zr, cnt1); 1067 1068 mov(tmp3, 0x0101010101010101); 1069 1070 BIND(CH1_LOOP); 1071 ldr(ch1, Address(str1, cnt1_neg)); 1072 eor(ch1, ch, ch1); 1073 sub(tmp1, ch1, tmp3); 1074 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1075 bics(tmp1, tmp1, tmp2); 1076 br(NE, HAS_ZERO); 1077 adds(cnt1_neg, cnt1_neg, 8); 1078 br(LT, CH1_LOOP); 1079 1080 cmp(cnt1_neg, (u1)8); 1081 mov(cnt1_neg, 0); 1082 br(LT, CH1_LOOP); 1083 b(NOMATCH); 1084 1085 BIND(HAS_ZERO); 1086 rev(tmp1, tmp1); 1087 clz(tmp1, tmp1); 1088 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1089 b(MATCH); 1090 1091 BIND(DO1_SHORT); 1092 mov(result_tmp, cnt1); 1093 lea(str1, Address(str1, cnt1)); 1094 sub(cnt1_neg, zr, cnt1); 1095 BIND(DO1_LOOP); 1096 ldrb(ch1, Address(str1, cnt1_neg)); 1097 cmp(ch, ch1); 1098 br(EQ, MATCH); 1099 adds(cnt1_neg, cnt1_neg, 1); 1100 br(LT, DO1_LOOP); 1101 BIND(NOMATCH); 1102 mov(result, -1); 1103 b(DONE); 1104 BIND(MATCH); 1105 add(result, result_tmp, cnt1_neg); 1106 BIND(DONE); 1107 } 1108 1109 // Compare strings. 
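// Informally, the contract implemented below follows the usual compareTo
// convention (negative, zero or positive): the result is the difference of the
// first pair of characters that differ, or, when one string is a prefix of the
// other, the saved difference of the two character counts.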
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
1152 subsw(result, cnt1, cnt2); 1153 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1154 1155 // A very short string 1156 cmpw(cnt2, minCharsInWord); 1157 br(Assembler::LE, SHORT_STRING); 1158 1159 // Compare longwords 1160 // load first parts of strings and finish initialization while loading 1161 { 1162 if (str1_isL == str2_isL) { // LL or UU 1163 ldr(tmp1, Address(str1)); 1164 cmp(str1, str2); 1165 br(Assembler::EQ, DONE); 1166 ldr(tmp2, Address(str2)); 1167 cmp(cnt2, stub_threshold); 1168 br(GE, STUB); 1169 subsw(cnt2, cnt2, minCharsInWord); 1170 br(EQ, TAIL_CHECK); 1171 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1172 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1173 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1174 } else if (isLU) { 1175 ldrs(vtmp, Address(str1)); 1176 ldr(tmp2, Address(str2)); 1177 cmp(cnt2, stub_threshold); 1178 br(GE, STUB); 1179 subw(cnt2, cnt2, 4); 1180 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1181 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1182 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1183 zip1(vtmp, T8B, vtmp, vtmpZ); 1184 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1185 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1186 add(cnt1, cnt1, 4); 1187 fmovd(tmp1, vtmp); 1188 } else { // UL case 1189 ldr(tmp1, Address(str1)); 1190 ldrs(vtmp, Address(str2)); 1191 cmp(cnt2, stub_threshold); 1192 br(GE, STUB); 1193 subw(cnt2, cnt2, 4); 1194 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1195 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1196 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1197 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1198 zip1(vtmp, T8B, vtmp, vtmpZ); 1199 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1200 add(cnt1, cnt1, 8); 1201 fmovd(tmp2, vtmp); 1202 } 1203 adds(cnt2, cnt2, isUL ? 4 : 8); 1204 br(GE, TAIL); 1205 eor(rscratch2, tmp1, tmp2); 1206 cbnz(rscratch2, DIFF); 1207 // main loop 1208 bind(NEXT_WORD); 1209 if (str1_isL == str2_isL) { 1210 ldr(tmp1, Address(str1, cnt2)); 1211 ldr(tmp2, Address(str2, cnt2)); 1212 adds(cnt2, cnt2, 8); 1213 } else if (isLU) { 1214 ldrs(vtmp, Address(str1, cnt1)); 1215 ldr(tmp2, Address(str2, cnt2)); 1216 add(cnt1, cnt1, 4); 1217 zip1(vtmp, T8B, vtmp, vtmpZ); 1218 fmovd(tmp1, vtmp); 1219 adds(cnt2, cnt2, 8); 1220 } else { // UL 1221 ldrs(vtmp, Address(str2, cnt2)); 1222 ldr(tmp1, Address(str1, cnt1)); 1223 zip1(vtmp, T8B, vtmp, vtmpZ); 1224 add(cnt1, cnt1, 8); 1225 fmovd(tmp2, vtmp); 1226 adds(cnt2, cnt2, 4); 1227 } 1228 br(GE, TAIL); 1229 1230 eor(rscratch2, tmp1, tmp2); 1231 cbz(rscratch2, NEXT_WORD); 1232 b(DIFF); 1233 bind(TAIL); 1234 eor(rscratch2, tmp1, tmp2); 1235 cbnz(rscratch2, DIFF); 1236 // Last longword. In the case where length == 4 we compare the 1237 // same longword twice, but that's still faster than another 1238 // conditional branch. 1239 if (str1_isL == str2_isL) { 1240 ldr(tmp1, Address(str1)); 1241 ldr(tmp2, Address(str2)); 1242 } else if (isLU) { 1243 ldrs(vtmp, Address(str1)); 1244 ldr(tmp2, Address(str2)); 1245 zip1(vtmp, T8B, vtmp, vtmpZ); 1246 fmovd(tmp1, vtmp); 1247 } else { // UL 1248 ldrs(vtmp, Address(str2)); 1249 ldr(tmp1, Address(str1)); 1250 zip1(vtmp, T8B, vtmp, vtmpZ); 1251 fmovd(tmp2, vtmp); 1252 } 1253 bind(TAIL_CHECK); 1254 eor(rscratch2, tmp1, tmp2); 1255 cbz(rscratch2, DONE); 1256 1257 // Find the first different characters in the longwords and 1258 // compute their difference. 1259 bind(DIFF); 1260 rev(rscratch2, rscratch2); 1261 clz(rscratch2, rscratch2); 1262 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1263 lsrv(tmp1, tmp1, rscratch2); 1264 (this->*ext_chr)(tmp1, tmp1); 1265 lsrv(tmp2, tmp2, rscratch2); 1266 (this->*ext_chr)(tmp2, tmp2); 1267 subw(result, tmp1, tmp2); 1268 b(DONE); 1269 } 1270 1271 bind(STUB); 1272 RuntimeAddress stub = nullptr; 1273 switch(ae) { 1274 case StrIntrinsicNode::LL: 1275 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1276 break; 1277 case StrIntrinsicNode::UU: 1278 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1279 break; 1280 case StrIntrinsicNode::LU: 1281 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1282 break; 1283 case StrIntrinsicNode::UL: 1284 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1285 break; 1286 default: 1287 ShouldNotReachHere(); 1288 } 1289 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1290 address call = trampoline_call(stub); 1291 if (call == nullptr) { 1292 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1293 ciEnv::current()->record_failure("CodeCache is full"); 1294 return; 1295 } 1296 b(DONE); 1297 1298 bind(SHORT_STRING); 1299 // Is the minimum length zero? 1300 cbz(cnt2, DONE); 1301 // arrange code to do most branches while loading and loading next characters 1302 // while comparing previous 1303 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1304 subs(cnt2, cnt2, 1); 1305 br(EQ, SHORT_LAST_INIT); 1306 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1307 b(SHORT_LOOP_START); 1308 bind(SHORT_LOOP); 1309 subs(cnt2, cnt2, 1); 1310 br(EQ, SHORT_LAST); 1311 bind(SHORT_LOOP_START); 1312 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1313 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1314 cmp(tmp1, cnt1); 1315 br(NE, SHORT_LOOP_TAIL); 1316 subs(cnt2, cnt2, 1); 1317 br(EQ, SHORT_LAST2); 1318 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1319 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1320 cmp(tmp2, rscratch1); 1321 br(EQ, SHORT_LOOP); 1322 sub(result, tmp2, rscratch1); 1323 b(DONE); 1324 bind(SHORT_LOOP_TAIL); 1325 sub(result, tmp1, cnt1); 1326 b(DONE); 1327 bind(SHORT_LAST2); 1328 cmp(tmp2, rscratch1); 1329 br(EQ, DONE); 1330 sub(result, tmp2, rscratch1); 1331 1332 b(DONE); 1333 bind(SHORT_LAST_INIT); 1334 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1335 bind(SHORT_LAST); 1336 cmp(tmp1, cnt1); 1337 br(EQ, DONE); 1338 sub(result, tmp1, cnt1); 1339 1340 bind(DONE); 1341 1342 BLOCK_COMMENT("} string_compare"); 1343 } 1344 1345 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1346 FloatRegister src2, Condition cond, bool isQ) { 1347 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1348 FloatRegister zn = src1, zm = src2; 1349 bool needs_negation = false; 1350 switch (cond) { 1351 case LT: cond = GT; zn = src2; zm = src1; break; 1352 case LE: cond = GE; zn = src2; zm = src1; break; 1353 case LO: cond = HI; zn = src2; zm = src1; break; 1354 case LS: cond = HS; zn = src2; zm = src1; break; 1355 case NE: cond = EQ; needs_negation = true; break; 1356 default: 1357 break; 1358 } 1359 1360 if (is_floating_point_type(bt)) { 1361 fcm(cond, dst, size, zn, zm); 1362 } else { 1363 cm(cond, dst, size, zn, zm); 1364 } 1365 1366 if (needs_negation) { 1367 notr(dst, isQ ? 
T16B : T8B, dst); 1368 } 1369 } 1370 1371 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1372 Condition cond, bool isQ) { 1373 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1374 if (bt == T_FLOAT || bt == T_DOUBLE) { 1375 if (cond == Assembler::NE) { 1376 fcm(Assembler::EQ, dst, size, src); 1377 notr(dst, isQ ? T16B : T8B, dst); 1378 } else { 1379 fcm(cond, dst, size, src); 1380 } 1381 } else { 1382 if (cond == Assembler::NE) { 1383 cm(Assembler::EQ, dst, size, src); 1384 notr(dst, isQ ? T16B : T8B, dst); 1385 } else { 1386 cm(cond, dst, size, src); 1387 } 1388 } 1389 } 1390 1391 // Compress the least significant bit of each byte to the rightmost and clear 1392 // the higher garbage bits. 1393 void C2_MacroAssembler::bytemask_compress(Register dst) { 1394 // Example input, dst = 0x01 00 00 00 01 01 00 01 1395 // The "??" bytes are garbage. 1396 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1397 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1398 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1399 andr(dst, dst, 0xff); // dst = 0x8D 1400 } 1401 1402 // Pack the lowest-numbered bit of each mask element in src into a long value 1403 // in dst, at most the first 64 lane elements. 1404 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1405 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1406 FloatRegister vtmp1, FloatRegister vtmp2) { 1407 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1408 assert_different_registers(dst, rscratch1); 1409 assert_different_registers(vtmp1, vtmp2); 1410 1411 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1412 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1413 // Expected: dst = 0x658D 1414 1415 // Convert the mask into vector with sequential bytes. 1416 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1417 sve_cpy(vtmp1, size, src, 1, false); 1418 if (bt != T_BYTE) { 1419 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1420 } 1421 1422 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1423 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1424 // is to compress each significant bit of the byte in a cross-lane way. Due 1425 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1426 // (bit-compress in each lane) with the biggest lane size (T = D) then 1427 // concatenate the results. 1428 1429 // The second source input of BEXT, initialized with 0x01 in each byte. 1430 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1431 sve_dup(vtmp2, B, 1); 1432 1433 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1434 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1435 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1436 // --------------------------------------- 1437 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1438 sve_bext(vtmp1, D, vtmp1, vtmp2); 1439 1440 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1441 // result to dst. 1442 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1443 // dst = 0x658D 1444 if (lane_cnt <= 8) { 1445 // No need to concatenate. 
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different types,
// with a valid src (0x658D) on a 1024-bit vector size machine.
//   BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
//   SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
//   INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
//   LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D, which
// has 24 significant bits, would be an invalid input if the dst predicate register refers
// to a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01101001 10001101

  // Put the long value from the general purpose register into the first lane of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum unit of one byte, we need to
  // transform the bit mask in the first lane into a byte mask, which can be
  // done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
1516 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1517 sve_dup(vtmp2, B, 1); 1518 1519 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1520 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1521 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1522 // --------------------------------------- 1523 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1524 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1525 1526 if (bt != T_BYTE) { 1527 sve_vector_extend(vtmp1, size, vtmp1, B); 1528 } 1529 // Generate mask according to the given vector, in which the elements have been 1530 // extended to expected type. 1531 // dst = 0b01101001 10001101 1532 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1533 } 1534 1535 // Clobbers: rflags 1536 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1537 FloatRegister zn, FloatRegister zm, Condition cond) { 1538 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1539 FloatRegister z1 = zn, z2 = zm; 1540 switch (cond) { 1541 case LE: z1 = zm; z2 = zn; cond = GE; break; 1542 case LT: z1 = zm; z2 = zn; cond = GT; break; 1543 case LO: z1 = zm; z2 = zn; cond = HI; break; 1544 case LS: z1 = zm; z2 = zn; cond = HS; break; 1545 default: 1546 break; 1547 } 1548 1549 SIMD_RegVariant size = elemType_to_regVariant(bt); 1550 if (is_floating_point_type(bt)) { 1551 sve_fcm(cond, pd, size, pg, z1, z2); 1552 } else { 1553 assert(is_integral_type(bt), "unsupported element type"); 1554 sve_cmp(cond, pd, size, pg, z1, z2); 1555 } 1556 } 1557 1558 // Get index of the last mask lane that is set 1559 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1560 SIMD_RegVariant size = elemType_to_regVariant(bt); 1561 sve_rev(ptmp, size, src); 1562 sve_brkb(ptmp, ptrue, ptmp, false); 1563 sve_cntp(dst, size, ptrue, ptmp); 1564 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1565 subw(dst, rscratch1, dst); 1566 } 1567 1568 // Extend integer vector src to dst with the same lane count 1569 // but larger element size, e.g. 4B -> 4I 1570 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1571 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1572 if (src_bt == T_BYTE) { 1573 // 4B to 4S/4I, 8B to 8S 1574 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported"); 1575 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported"); 1576 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1577 if (dst_bt == T_INT) { 1578 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1579 } 1580 } else if (src_bt == T_SHORT) { 1581 // 2S to 2I/2L, 4S to 4I 1582 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported"); 1583 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported"); 1584 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1585 if (dst_bt == T_LONG) { 1586 _xshll(is_unsigned, dst, T2D, dst, T2S, 0); 1587 } 1588 } else if (src_bt == T_INT) { 1589 // 2I to 2L 1590 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1591 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1592 } else { 1593 ShouldNotReachHere(); 1594 } 1595 } 1596 1597 // Narrow integer vector src down to dst with the same lane count 1598 // but smaller element size, e.g. 
4I -> 4B 1599 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1600 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1601 if (src_bt == T_SHORT) { 1602 // 4S/8S to 4B/8B 1603 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1604 assert(dst_bt == T_BYTE, "unsupported"); 1605 xtn(dst, T8B, src, T8H); 1606 } else if (src_bt == T_INT) { 1607 // 2I to 2S, 4I to 4B/4S 1608 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1609 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1610 xtn(dst, T4H, src, T4S); 1611 if (dst_bt == T_BYTE) { 1612 xtn(dst, T8B, dst, T8H); 1613 } 1614 } else if (src_bt == T_LONG) { 1615 // 2L to 2S/2I 1616 assert(src_vlen_in_bytes == 16, "unsupported"); 1617 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported"); 1618 xtn(dst, T2S, src, T2D); 1619 if (dst_bt == T_SHORT) { 1620 xtn(dst, T4H, dst, T4S); 1621 } 1622 } else { 1623 ShouldNotReachHere(); 1624 } 1625 } 1626 1627 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1628 FloatRegister src, SIMD_RegVariant src_size, 1629 bool is_unsigned) { 1630 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1631 1632 if (src_size == B) { 1633 switch (dst_size) { 1634 case H: 1635 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1636 break; 1637 case S: 1638 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1639 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1640 break; 1641 case D: 1642 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1643 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1644 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1645 break; 1646 default: 1647 ShouldNotReachHere(); 1648 } 1649 } else if (src_size == H) { 1650 if (dst_size == S) { 1651 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1652 } else { // D 1653 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1654 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1655 } 1656 } else if (src_size == S) { 1657 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1658 } 1659 } 1660 1661 // Vector narrow from src to dst with specified element sizes. 1662 // High part of dst vector will be filled with zero. 
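// Roughly, for a D -> S narrow with the temporary cleared to zero: uzp1 at the
// S size gathers the even-numbered 32-bit elements, i.e. the low half of every
// 64-bit lane of src, into the lower half of dst, while the even elements of
// the zeroed temporary supply the upper half, which is what leaves the high
// part of dst zeroed as noted above.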
1663 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1664 FloatRegister src, SIMD_RegVariant src_size, 1665 FloatRegister tmp) { 1666 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1667 assert_different_registers(src, tmp); 1668 sve_dup(tmp, src_size, 0); 1669 if (src_size == D) { 1670 switch (dst_size) { 1671 case S: 1672 sve_uzp1(dst, S, src, tmp); 1673 break; 1674 case H: 1675 assert_different_registers(dst, tmp); 1676 sve_uzp1(dst, S, src, tmp); 1677 sve_uzp1(dst, H, dst, tmp); 1678 break; 1679 case B: 1680 assert_different_registers(dst, tmp); 1681 sve_uzp1(dst, S, src, tmp); 1682 sve_uzp1(dst, H, dst, tmp); 1683 sve_uzp1(dst, B, dst, tmp); 1684 break; 1685 default: 1686 ShouldNotReachHere(); 1687 } 1688 } else if (src_size == S) { 1689 if (dst_size == H) { 1690 sve_uzp1(dst, H, src, tmp); 1691 } else { // B 1692 assert_different_registers(dst, tmp); 1693 sve_uzp1(dst, H, src, tmp); 1694 sve_uzp1(dst, B, dst, tmp); 1695 } 1696 } else if (src_size == H) { 1697 sve_uzp1(dst, B, src, tmp); 1698 } 1699 } 1700 1701 // Extend src predicate to dst predicate with the same lane count but larger 1702 // element size, e.g. 64Byte -> 512Long 1703 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1704 uint dst_element_length_in_bytes, 1705 uint src_element_length_in_bytes) { 1706 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1707 sve_punpklo(dst, src); 1708 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1709 sve_punpklo(dst, src); 1710 sve_punpklo(dst, dst); 1711 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1712 sve_punpklo(dst, src); 1713 sve_punpklo(dst, dst); 1714 sve_punpklo(dst, dst); 1715 } else { 1716 assert(false, "unsupported"); 1717 ShouldNotReachHere(); 1718 } 1719 } 1720 1721 // Narrow src predicate to dst predicate with the same lane count but 1722 // smaller element size, e.g. 512Long -> 64Byte 1723 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1724 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1725 // The insignificant bits in src predicate are expected to be zero. 1726 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1727 // passed as the second argument. An example narrowing operation with a given mask would be - 1728 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1729 // Mask (for 2 Longs) : TF 1730 // Predicate register for the above mask (16 bits) : 00000001 00000000 1731 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1732 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1733 assert_different_registers(src, ptmp); 1734 assert_different_registers(dst, ptmp); 1735 sve_pfalse(ptmp); 1736 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1737 sve_uzp1(dst, B, src, ptmp); 1738 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1739 sve_uzp1(dst, H, src, ptmp); 1740 sve_uzp1(dst, B, dst, ptmp); 1741 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1742 sve_uzp1(dst, S, src, ptmp); 1743 sve_uzp1(dst, H, dst, ptmp); 1744 sve_uzp1(dst, B, dst, ptmp); 1745 } else { 1746 assert(false, "unsupported"); 1747 ShouldNotReachHere(); 1748 } 1749 } 1750 1751 // Vector reduction add for integral type with ASIMD instructions. 
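// For example, with bt == T_BYTE, an 8B vsrc of {1, 2, ..., 8} and isrc == 10, addv collapses the eight lanes to 36, smov sign-extends that byte sum into dst, and the final addw folds in isrc to give 46 (values chosen only for illustration).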
1752 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1753 Register isrc, FloatRegister vsrc, 1754 unsigned vector_length_in_bytes, 1755 FloatRegister vtmp) { 1756 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1757 assert_different_registers(dst, isrc); 1758 bool isQ = vector_length_in_bytes == 16; 1759 1760 BLOCK_COMMENT("neon_reduce_add_integral {"); 1761 switch(bt) { 1762 case T_BYTE: 1763 addv(vtmp, isQ ? T16B : T8B, vsrc); 1764 smov(dst, vtmp, B, 0); 1765 addw(dst, dst, isrc, ext::sxtb); 1766 break; 1767 case T_SHORT: 1768 addv(vtmp, isQ ? T8H : T4H, vsrc); 1769 smov(dst, vtmp, H, 0); 1770 addw(dst, dst, isrc, ext::sxth); 1771 break; 1772 case T_INT: 1773 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1774 umov(dst, vtmp, S, 0); 1775 addw(dst, dst, isrc); 1776 break; 1777 case T_LONG: 1778 assert(isQ, "unsupported"); 1779 addpd(vtmp, vsrc); 1780 umov(dst, vtmp, D, 0); 1781 add(dst, dst, isrc); 1782 break; 1783 default: 1784 assert(false, "unsupported"); 1785 ShouldNotReachHere(); 1786 } 1787 BLOCK_COMMENT("} neon_reduce_add_integral"); 1788 } 1789 1790 // Vector reduction multiply for integral type with ASIMD instructions. 1791 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1792 // Clobbers: rscratch1 1793 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1794 Register isrc, FloatRegister vsrc, 1795 unsigned vector_length_in_bytes, 1796 FloatRegister vtmp1, FloatRegister vtmp2) { 1797 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1798 bool isQ = vector_length_in_bytes == 16; 1799 1800 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1801 switch(bt) { 1802 case T_BYTE: 1803 if (isQ) { 1804 // Multiply the lower half and higher half of vector iteratively. 
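// Each step below halves the number of live lanes (16 -> 8 -> 4 in this branch, then 4 -> 2 after it); the final two partial products and isrc are combined with the scalar mulw/sxtb sequence.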
1805 // vtmp1 = vsrc[8:15] 1806 ins(vtmp1, D, vsrc, 0, 1); 1807 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1808 mulv(vtmp1, T8B, vtmp1, vsrc); 1809 // vtmp2 = vtmp1[4:7] 1810 ins(vtmp2, S, vtmp1, 0, 1); 1811 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1812 mulv(vtmp1, T8B, vtmp2, vtmp1); 1813 } else { 1814 ins(vtmp1, S, vsrc, 0, 1); 1815 mulv(vtmp1, T8B, vtmp1, vsrc); 1816 } 1817 // vtmp2 = vtmp1[2:3] 1818 ins(vtmp2, H, vtmp1, 0, 1); 1819 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1820 mulv(vtmp2, T8B, vtmp2, vtmp1); 1821 // dst = vtmp2[0] * isrc * vtmp2[1] 1822 umov(rscratch1, vtmp2, B, 0); 1823 mulw(dst, rscratch1, isrc); 1824 sxtb(dst, dst); 1825 umov(rscratch1, vtmp2, B, 1); 1826 mulw(dst, rscratch1, dst); 1827 sxtb(dst, dst); 1828 break; 1829 case T_SHORT: 1830 if (isQ) { 1831 ins(vtmp2, D, vsrc, 0, 1); 1832 mulv(vtmp2, T4H, vtmp2, vsrc); 1833 ins(vtmp1, S, vtmp2, 0, 1); 1834 mulv(vtmp1, T4H, vtmp1, vtmp2); 1835 } else { 1836 ins(vtmp1, S, vsrc, 0, 1); 1837 mulv(vtmp1, T4H, vtmp1, vsrc); 1838 } 1839 umov(rscratch1, vtmp1, H, 0); 1840 mulw(dst, rscratch1, isrc); 1841 sxth(dst, dst); 1842 umov(rscratch1, vtmp1, H, 1); 1843 mulw(dst, rscratch1, dst); 1844 sxth(dst, dst); 1845 break; 1846 case T_INT: 1847 if (isQ) { 1848 ins(vtmp1, D, vsrc, 0, 1); 1849 mulv(vtmp1, T2S, vtmp1, vsrc); 1850 } else { 1851 vtmp1 = vsrc; 1852 } 1853 umov(rscratch1, vtmp1, S, 0); 1854 mul(dst, rscratch1, isrc); 1855 umov(rscratch1, vtmp1, S, 1); 1856 mul(dst, rscratch1, dst); 1857 break; 1858 case T_LONG: 1859 umov(rscratch1, vsrc, D, 0); 1860 mul(dst, isrc, rscratch1); 1861 umov(rscratch1, vsrc, D, 1); 1862 mul(dst, dst, rscratch1); 1863 break; 1864 default: 1865 assert(false, "unsupported"); 1866 ShouldNotReachHere(); 1867 } 1868 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1869 } 1870 1871 // Vector reduction multiply for floating-point type with ASIMD instructions. 1872 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1873 FloatRegister fsrc, FloatRegister vsrc, 1874 unsigned vector_length_in_bytes, 1875 FloatRegister vtmp) { 1876 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1877 bool isQ = vector_length_in_bytes == 16; 1878 1879 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1880 switch(bt) { 1881 case T_FLOAT: 1882 fmuls(dst, fsrc, vsrc); 1883 ins(vtmp, S, vsrc, 0, 1); 1884 fmuls(dst, dst, vtmp); 1885 if (isQ) { 1886 ins(vtmp, S, vsrc, 0, 2); 1887 fmuls(dst, dst, vtmp); 1888 ins(vtmp, S, vsrc, 0, 3); 1889 fmuls(dst, dst, vtmp); 1890 } 1891 break; 1892 case T_DOUBLE: 1893 assert(isQ, "unsupported"); 1894 fmuld(dst, fsrc, vsrc); 1895 ins(vtmp, D, vsrc, 0, 1); 1896 fmuld(dst, dst, vtmp); 1897 break; 1898 default: 1899 assert(false, "unsupported"); 1900 ShouldNotReachHere(); 1901 } 1902 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1903 } 1904 1905 // Helper to select logical instruction 1906 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1907 Register Rn, Register Rm, 1908 enum shift_kind kind, unsigned shift) { 1909 switch(opc) { 1910 case Op_AndReductionV: 1911 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1912 break; 1913 case Op_OrReductionV: 1914 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1915 break; 1916 case Op_XorReductionV: 1917 is64 ? 
eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1918 break; 1919 default: 1920 assert(false, "unsupported"); 1921 ShouldNotReachHere(); 1922 } 1923 } 1924 1925 // Vector reduction logical operations And, Or, Xor 1926 // Clobbers: rscratch1 1927 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1928 Register isrc, FloatRegister vsrc, 1929 unsigned vector_length_in_bytes) { 1930 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1931 "unsupported"); 1932 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1933 assert_different_registers(dst, isrc); 1934 bool isQ = vector_length_in_bytes == 16; 1935 1936 BLOCK_COMMENT("neon_reduce_logical {"); 1937 umov(rscratch1, vsrc, isQ ? D : S, 0); 1938 umov(dst, vsrc, isQ ? D : S, 1); 1939 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 1940 switch(bt) { 1941 case T_BYTE: 1942 if (isQ) { 1943 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1944 } 1945 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1946 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 1947 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1948 sxtb(dst, dst); 1949 break; 1950 case T_SHORT: 1951 if (isQ) { 1952 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1953 } 1954 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1955 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1956 sxth(dst, dst); 1957 break; 1958 case T_INT: 1959 if (isQ) { 1960 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1961 } 1962 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1963 break; 1964 case T_LONG: 1965 assert(isQ, "unsupported"); 1966 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 1967 break; 1968 default: 1969 assert(false, "unsupported"); 1970 ShouldNotReachHere(); 1971 } 1972 BLOCK_COMMENT("} neon_reduce_logical"); 1973 } 1974 1975 // Vector reduction min/max for integral type with ASIMD instructions. 1976 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 1977 // Clobbers: rscratch1, rflags 1978 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 1979 Register isrc, FloatRegister vsrc, 1980 unsigned vector_length_in_bytes, 1981 FloatRegister vtmp) { 1982 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 1983 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1984 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 1985 assert_different_registers(dst, isrc); 1986 bool isQ = vector_length_in_bytes == 16; 1987 bool is_min = opc == Op_MinReductionV; 1988 1989 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 1990 if (bt == T_LONG) { 1991 assert(vtmp == fnoreg, "should be"); 1992 assert(isQ, "should be"); 1993 umov(rscratch1, vsrc, D, 0); 1994 cmp(isrc, rscratch1); 1995 csel(dst, isrc, rscratch1, is_min ? LT : GT); 1996 umov(rscratch1, vsrc, D, 1); 1997 cmp(dst, rscratch1); 1998 csel(dst, dst, rscratch1, is_min ? LT : GT); 1999 } else { 2000 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2001 if (size == T2S) { 2002 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2003 } else { 2004 is_min ? 
sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2005 } 2006 if (bt == T_INT) { 2007 umov(dst, vtmp, S, 0); 2008 } else { 2009 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2010 } 2011 cmpw(dst, isrc); 2012 cselw(dst, dst, isrc, is_min ? LT : GT); 2013 } 2014 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2015 } 2016 2017 // Vector reduction for integral type with SVE instruction. 2018 // Supported operations are Add, And, Or, Xor, Max, Min. 2019 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 2020 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2021 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2022 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2023 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2024 assert_different_registers(src1, dst); 2025 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2026 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2027 switch (opc) { 2028 case Op_AddReductionVI: { 2029 sve_uaddv(tmp, size, pg, src2); 2030 if (bt == T_BYTE) { 2031 smov(dst, tmp, size, 0); 2032 addw(dst, src1, dst, ext::sxtb); 2033 } else if (bt == T_SHORT) { 2034 smov(dst, tmp, size, 0); 2035 addw(dst, src1, dst, ext::sxth); 2036 } else { 2037 umov(dst, tmp, size, 0); 2038 addw(dst, dst, src1); 2039 } 2040 break; 2041 } 2042 case Op_AddReductionVL: { 2043 sve_uaddv(tmp, size, pg, src2); 2044 umov(dst, tmp, size, 0); 2045 add(dst, dst, src1); 2046 break; 2047 } 2048 case Op_AndReductionV: { 2049 sve_andv(tmp, size, pg, src2); 2050 if (bt == T_INT || bt == T_LONG) { 2051 umov(dst, tmp, size, 0); 2052 } else { 2053 smov(dst, tmp, size, 0); 2054 } 2055 if (bt == T_LONG) { 2056 andr(dst, dst, src1); 2057 } else { 2058 andw(dst, dst, src1); 2059 } 2060 break; 2061 } 2062 case Op_OrReductionV: { 2063 sve_orv(tmp, size, pg, src2); 2064 if (bt == T_INT || bt == T_LONG) { 2065 umov(dst, tmp, size, 0); 2066 } else { 2067 smov(dst, tmp, size, 0); 2068 } 2069 if (bt == T_LONG) { 2070 orr(dst, dst, src1); 2071 } else { 2072 orrw(dst, dst, src1); 2073 } 2074 break; 2075 } 2076 case Op_XorReductionV: { 2077 sve_eorv(tmp, size, pg, src2); 2078 if (bt == T_INT || bt == T_LONG) { 2079 umov(dst, tmp, size, 0); 2080 } else { 2081 smov(dst, tmp, size, 0); 2082 } 2083 if (bt == T_LONG) { 2084 eor(dst, dst, src1); 2085 } else { 2086 eorw(dst, dst, src1); 2087 } 2088 break; 2089 } 2090 case Op_MaxReductionV: { 2091 sve_smaxv(tmp, size, pg, src2); 2092 if (bt == T_INT || bt == T_LONG) { 2093 umov(dst, tmp, size, 0); 2094 } else { 2095 smov(dst, tmp, size, 0); 2096 } 2097 if (bt == T_LONG) { 2098 cmp(dst, src1); 2099 csel(dst, dst, src1, Assembler::GT); 2100 } else { 2101 cmpw(dst, src1); 2102 cselw(dst, dst, src1, Assembler::GT); 2103 } 2104 break; 2105 } 2106 case Op_MinReductionV: { 2107 sve_sminv(tmp, size, pg, src2); 2108 if (bt == T_INT || bt == T_LONG) { 2109 umov(dst, tmp, size, 0); 2110 } else { 2111 smov(dst, tmp, size, 0); 2112 } 2113 if (bt == T_LONG) { 2114 cmp(dst, src1); 2115 csel(dst, dst, src1, Assembler::LT); 2116 } else { 2117 cmpw(dst, src1); 2118 cselw(dst, dst, src1, Assembler::LT); 2119 } 2120 break; 2121 } 2122 default: 2123 assert(false, "unsupported"); 2124 ShouldNotReachHere(); 2125 } 2126 2127 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2128 if (bt == T_BYTE) { 2129 sxtb(dst, dst); 2130 } else if (bt == T_SHORT) { 2131 
sxth(dst, dst); 2132 } 2133 } 2134 } 2135 2136 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2137 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2138 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 2139 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2140 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2141 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2142 2143 // Set all elements to false if the input "lane_cnt" is zero. 2144 if (lane_cnt == 0) { 2145 sve_pfalse(dst); 2146 return; 2147 } 2148 2149 SIMD_RegVariant size = elemType_to_regVariant(bt); 2150 assert(size != Q, "invalid size"); 2151 2152 // Set all true if "lane_cnt" equals to the max lane count. 2153 if (lane_cnt == max_vector_length) { 2154 sve_ptrue(dst, size, /* ALL */ 0b11111); 2155 return; 2156 } 2157 2158 // Fixed numbers for "ptrue". 2159 switch(lane_cnt) { 2160 case 1: /* VL1 */ 2161 case 2: /* VL2 */ 2162 case 3: /* VL3 */ 2163 case 4: /* VL4 */ 2164 case 5: /* VL5 */ 2165 case 6: /* VL6 */ 2166 case 7: /* VL7 */ 2167 case 8: /* VL8 */ 2168 sve_ptrue(dst, size, lane_cnt); 2169 return; 2170 case 16: 2171 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2172 return; 2173 case 32: 2174 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2175 return; 2176 case 64: 2177 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2178 return; 2179 case 128: 2180 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2181 return; 2182 case 256: 2183 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2184 return; 2185 default: 2186 break; 2187 } 2188 2189 // Special patterns for "ptrue". 2190 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2191 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2192 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2193 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2194 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2195 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2196 } else { 2197 // Encode to "whileltw" for the remaining cases. 2198 mov(rscratch1, lane_cnt); 2199 sve_whileltw(dst, size, zr, rscratch1); 2200 } 2201 } 2202 2203 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2204 // Any remaining elements of dst will be filled with zero. 2205 // Clobbers: rscratch1 2206 // Preserves: src, mask 2207 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2208 FloatRegister vtmp1, FloatRegister vtmp2, 2209 PRegister pgtmp) { 2210 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2211 assert_different_registers(dst, src, vtmp1, vtmp2); 2212 assert_different_registers(mask, pgtmp); 2213 2214 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 2215 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 2216 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 2217 sve_dup(vtmp2, H, 0); 2218 2219 // Extend lowest half to type INT. 2220 // dst = 00004444 00003333 00002222 00001111 2221 sve_uunpklo(dst, S, src); 2222 // pgtmp = 00000001 00000000 00000001 00000001 2223 sve_punpklo(pgtmp, mask); 2224 // Pack the active elements in size of type INT to the right, 2225 // and fill the remainings with zero. 2226 // dst = 00000000 00004444 00002222 00001111 2227 sve_compact(dst, S, dst, pgtmp); 2228 // Narrow the result back to type SHORT. 
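// vtmp2 still holds the zero vector set up above, so the uzp1 below both narrows the packed S lanes back to H and keeps the upper half of dst zeroed.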
2229 // dst = 0000 0000 0000 0000 0000 4444 2222 1111 2230 sve_uzp1(dst, H, dst, vtmp2); 2231 // Count the active elements of lowest half. 2232 // rscratch1 = 3 2233 sve_cntp(rscratch1, S, ptrue, pgtmp); 2234 2235 // Repeat to the highest half. 2236 // pgtmp = 00000001 00000000 00000000 00000001 2237 sve_punpkhi(pgtmp, mask); 2238 // vtmp1 = 00008888 00007777 00006666 00005555 2239 sve_uunpkhi(vtmp1, S, src); 2240 // vtmp1 = 00000000 00000000 00008888 00005555 2241 sve_compact(vtmp1, S, vtmp1, pgtmp); 2242 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2243 sve_uzp1(vtmp1, H, vtmp1, vtmp2); 2244 2245 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111 2246 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2247 // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where 2248 // TRUE_CNT is the number of active elements in the compressed low. 2249 neg(rscratch1, rscratch1); 2250 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2251 sve_index(vtmp2, H, rscratch1, 1); 2252 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000 2253 sve_tbl(vtmp1, H, vtmp1, vtmp2); 2254 2255 // Combine the compressed high (after shifting) with the compressed low. 2256 // dst = 0000 0000 0000 8888 5555 4444 2222 1111 2257 sve_orr(dst, dst, vtmp1); 2258 } 2259 2260 // Clobbers: rscratch1, rscratch2 2261 // Preserves: src, mask 2262 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, 2263 FloatRegister vtmp1, FloatRegister vtmp2, 2264 FloatRegister vtmp3, FloatRegister vtmp4, 2265 PRegister ptmp, PRegister pgtmp) { 2266 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2267 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4); 2268 assert_different_registers(mask, ptmp, pgtmp); 2269 // Example input: src = 88 77 66 55 44 33 22 11 2270 // mask = 01 00 00 01 01 00 01 01 2271 // Expected result: dst = 00 00 00 88 55 44 22 11 2272 2273 sve_dup(vtmp4, B, 0); 2274 // Extend lowest half to type SHORT. 2275 // vtmp1 = 0044 0033 0022 0011 2276 sve_uunpklo(vtmp1, H, src); 2277 // ptmp = 0001 0000 0001 0001 2278 sve_punpklo(ptmp, mask); 2279 // Count the active elements of lowest half. 2280 // rscratch2 = 3 2281 sve_cntp(rscratch2, H, ptrue, ptmp); 2282 // Pack the active elements in size of type SHORT to the right, 2283 // and fill the remaining elements with zero. 2284 // dst = 0000 0044 0022 0011 2285 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp); 2286 // Narrow the result back to type BYTE. 2287 // dst = 00 00 00 00 00 44 22 11 2288 sve_uzp1(dst, B, dst, vtmp4); 2289 2290 // Repeat to the highest half. 2291 // ptmp = 0001 0000 0000 0001 2292 sve_punpkhi(ptmp, mask); 2293 // vtmp2 = 0088 0077 0066 0055 2294 sve_uunpkhi(vtmp2, H, src); 2295 // vtmp1 = 0000 0000 0088 0055 2296 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp); 2297 2298 sve_dup(vtmp4, B, 0); 2299 // vtmp1 = 00 00 00 00 00 00 88 55 2300 sve_uzp1(vtmp1, B, vtmp1, vtmp4); 2301 2302 // Compressed low: dst = 00 00 00 00 00 44 22 11 2303 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55 2304 // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where 2305 // TRUE_CNT is the number of active elements in the compressed low. 2306 neg(rscratch2, rscratch2); 2307 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2308 sve_index(vtmp2, B, rscratch2, 1); 2309 // vtmp1 = 00 00 00 88 55 00 00 00 2310 sve_tbl(vtmp1, B, vtmp1, vtmp2); 2311 // Combine the compressed high (after shifting) with the compressed low.
2312 // dst = 00 00 00 88 55 44 22 11 2313 sve_orr(dst, dst, vtmp1); 2314 } 2315 2316 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2317 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2318 SIMD_Arrangement size = isQ ? T16B : T8B; 2319 if (bt == T_BYTE) { 2320 rbit(dst, size, src); 2321 } else { 2322 neon_reverse_bytes(dst, src, bt, isQ); 2323 rbit(dst, size, dst); 2324 } 2325 } 2326 2327 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2328 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2329 SIMD_Arrangement size = isQ ? T16B : T8B; 2330 switch (bt) { 2331 case T_BYTE: 2332 if (dst != src) { 2333 orr(dst, size, src, src); 2334 } 2335 break; 2336 case T_SHORT: 2337 rev16(dst, size, src); 2338 break; 2339 case T_INT: 2340 rev32(dst, size, src); 2341 break; 2342 case T_LONG: 2343 rev64(dst, size, src); 2344 break; 2345 default: 2346 assert(false, "unsupported"); 2347 ShouldNotReachHere(); 2348 } 2349 } 2350 2351 // VectorRearrange implementation for short/int/float/long/double types with NEON 2352 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction. 2353 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group. 2354 // For VectorRearrange long/double, we compare the shuffle input with iota indices, 2355 // and use bsl to implement the operation. 2356 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src, 2357 FloatRegister shuffle, FloatRegister tmp, 2358 BasicType bt, bool isQ) { 2359 assert_different_registers(dst, src, shuffle, tmp); 2360 SIMD_Arrangement size1 = isQ ? T16B : T8B; 2361 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); 2362 2363 // Here is an example that rearranges a NEON vector with 4 ints: 2364 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] 2365 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1]. 2366 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector 2367 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get 2368 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. 2369 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100], 2370 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] 2371 // 4. Use Vm as index register, and use V1 as table register. 2372 // Then get V2 as the result by tbl NEON instructions. 2373 switch (bt) { 2374 case T_SHORT: 2375 mov(tmp, size1, 0x02); 2376 mulv(dst, size2, shuffle, tmp); 2377 mov(tmp, size2, 0x0100); 2378 addv(dst, size1, dst, tmp); 2379 tbl(dst, size1, src, 1, dst); 2380 break; 2381 case T_INT: 2382 case T_FLOAT: 2383 mov(tmp, size1, 0x04); 2384 mulv(dst, size2, shuffle, tmp); 2385 mov(tmp, size2, 0x03020100); 2386 addv(dst, size1, dst, tmp); 2387 tbl(dst, size1, src, 1, dst); 2388 break; 2389 case T_LONG: 2390 case T_DOUBLE: 2391 // Load the iota indices for Long type. The indices are ordered by 2392 // type B/S/I/L/F/D, and the offset between two types is 16; Hence 2393 // the offset for L is 48. 2394 lea(rscratch1, 2395 ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48)); 2396 ldrq(tmp, rscratch1); 2397 // Check whether the input "shuffle" is the same with iota indices. 2398 // Return "src" if true, otherwise swap the two elements of "src". 
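// cm(EQ) yields an all-ones lane mask where shuffle matches the iota values {0, 1}; ext rotates src by 8 bytes to swap its two 64-bit lanes, and bsl keeps the original lane where the mask is set and takes the swapped lane elsewhere.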
2399 cm(EQ, dst, size2, shuffle, tmp); 2400 ext(tmp, size1, src, src, 8); 2401 bsl(dst, size1, src, tmp); 2402 break; 2403 default: 2404 assert(false, "unsupported element type"); 2405 ShouldNotReachHere(); 2406 } 2407 } 2408 2409 // Extract a scalar element from an sve vector at position 'idx'. 2410 // The input elements in src are expected to be of integral type. 2411 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2412 int idx, FloatRegister vtmp) { 2413 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2414 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2415 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2416 if (bt == T_INT || bt == T_LONG) { 2417 umov(dst, src, size, idx); 2418 } else { 2419 smov(dst, src, size, idx); 2420 } 2421 } else { 2422 sve_orr(vtmp, src, src); 2423 sve_ext(vtmp, vtmp, idx << size); 2424 if (bt == T_INT || bt == T_LONG) { 2425 umov(dst, vtmp, size, 0); 2426 } else { 2427 smov(dst, vtmp, size, 0); 2428 } 2429 } 2430 } 2431 2432 // java.lang.Math::round intrinsics 2433 2434 // Clobbers: rscratch1, rflags 2435 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2436 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2437 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2438 switch (T) { 2439 case T2S: 2440 case T4S: 2441 fmovs(tmp1, T, 0.5f); 2442 mov(rscratch1, jint_cast(0x1.0p23f)); 2443 break; 2444 case T2D: 2445 fmovd(tmp1, T, 0.5); 2446 mov(rscratch1, julong_cast(0x1.0p52)); 2447 break; 2448 default: 2449 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2450 } 2451 fadd(tmp1, T, tmp1, src); 2452 fcvtms(tmp1, T, tmp1); 2453 // tmp1 = floor(src + 0.5, ties to even) 2454 2455 fcvtas(dst, T, src); 2456 // dst = round(src), ties to away 2457 2458 fneg(tmp3, T, src); 2459 dup(tmp2, T, rscratch1); 2460 cm(HS, tmp3, T, tmp3, tmp2); 2461 // tmp3 is now a set of flags 2462 2463 bif(dst, T16B, tmp1, tmp3); 2464 // result in dst 2465 } 2466 2467 // Clobbers: rscratch1, rflags 2468 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2469 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2470 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2471 assert_different_registers(tmp1, tmp2, src, dst); 2472 2473 switch (T) { 2474 case S: 2475 mov(rscratch1, jint_cast(0x1.0p23f)); 2476 break; 2477 case D: 2478 mov(rscratch1, julong_cast(0x1.0p52)); 2479 break; 2480 default: 2481 assert(T == S || T == D, "invalid register variant"); 2482 } 2483 2484 sve_frinta(dst, T, ptrue, src); 2485 // dst = round(src), ties to away 2486 2487 Label none; 2488 2489 sve_fneg(tmp1, T, ptrue, src); 2490 sve_dup(tmp2, T, rscratch1); 2491 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2492 br(EQ, none); 2493 { 2494 sve_cpy(tmp1, T, pgtmp, 0.5); 2495 sve_fadd(tmp1, T, pgtmp, src); 2496 sve_frintm(dst, T, pgtmp, tmp1); 2497 // dst = floor(src + 0.5, ties to even) 2498 } 2499 bind(none); 2500 2501 sve_fcvtzs(dst, T, ptrue, dst, T); 2502 // result in dst 2503 } 2504 2505 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2506 FloatRegister one, SIMD_Arrangement T) { 2507 assert_different_registers(dst, src, zero, one); 2508 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2509 2510 facgt(dst, T, src, zero); 2511 ushr(dst, T, 
dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise 2512 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst 2513 } 2514 2515 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero, 2516 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) { 2517 assert_different_registers(dst, src, zero, one, vtmp); 2518 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2519 2520 sve_orr(vtmp, src, src); 2521 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise 2522 switch (T) { 2523 case S: 2524 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src 2525 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending 2526 // on the sign of the float value 2527 break; 2528 case D: 2529 sve_and(vtmp, T, min_jlong); 2530 sve_orr(vtmp, T, jlong_cast(1.0)); 2531 break; 2532 default: 2533 assert(false, "unsupported"); 2534 ShouldNotReachHere(); 2535 } 2536 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp 2537 // Result in dst 2538 } 2539 2540 bool C2_MacroAssembler::in_scratch_emit_size() { 2541 if (ciEnv::current()->task() != nullptr) { 2542 PhaseOutput* phase_output = Compile::current()->output(); 2543 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2544 return true; 2545 } 2546 } 2547 return MacroAssembler::in_scratch_emit_size(); 2548 } 2549 2550 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) { 2551 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi); 2552 } 2553 2554 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) { 2555 assert(!t->empty() && !t->singleton(), "%s", Type::str(t)); 2556 if (t == TypeInt::INT) { 2557 return; 2558 } 2559 BLOCK_COMMENT("verify_int_in_range {"); 2560 Label L_success, L_failure; 2561 2562 jint lo = t->_lo; 2563 jint hi = t->_hi; 2564 2565 if (lo != min_jint && hi != max_jint) { 2566 subsw(rtmp, rval, lo); 2567 br(Assembler::LT, L_failure); 2568 subsw(rtmp, rval, hi); 2569 br(Assembler::LE, L_success); 2570 } else if (lo != min_jint) { 2571 subsw(rtmp, rval, lo); 2572 br(Assembler::GE, L_success); 2573 } else if (hi != max_jint) { 2574 subsw(rtmp, rval, hi); 2575 br(Assembler::LE, L_success); 2576 } else { 2577 ShouldNotReachHere(); 2578 } 2579 2580 bind(L_failure); 2581 movw(c_rarg0, idx); 2582 mov(c_rarg1, rval); 2583 movw(c_rarg2, lo); 2584 movw(c_rarg3, hi); 2585 reconstruct_frame_pointer(rtmp); 2586 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp); 2587 hlt(0); 2588 2589 bind(L_success); 2590 BLOCK_COMMENT("} verify_int_in_range"); 2591 } 2592 2593 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) { 2594 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi); 2595 } 2596 2597 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) { 2598 assert(!t->empty() && !t->singleton(), "%s", Type::str(t)); 2599 if (t == TypeLong::LONG) { 2600 return; 2601 } 2602 BLOCK_COMMENT("verify_long_in_range {"); 2603 Label L_success, L_failure; 2604 2605 jlong lo = t->_lo; 2606 jlong hi = t->_hi; 2607 2608 if (lo != min_jlong && hi != max_jlong) { 2609 subs(rtmp, rval, lo); 2610 br(Assembler::LT, L_failure); 2611 subs(rtmp, rval, hi);
2612 br(Assembler::LE, L_success); 2613 } else if (lo != min_jlong) { 2614 subs(rtmp, rval, lo); 2615 br(Assembler::GE, L_success); 2616 } else if (hi != max_jlong) { 2617 subs(rtmp, rval, hi); 2618 br(Assembler::LE, L_success); 2619 } else { 2620 ShouldNotReachHere(); 2621 } 2622 2623 bind(L_failure); 2624 movw(c_rarg0, idx); 2625 mov(c_rarg1, rval); 2626 mov(c_rarg2, lo); 2627 mov(c_rarg3, hi); 2628 reconstruct_frame_pointer(rtmp); 2629 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp); 2630 hlt(0); 2631 2632 bind(L_success); 2633 BLOCK_COMMENT("} verify_long_in_range"); 2634 } 2635 2636 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 2637 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 2638 if (PreserveFramePointer) { 2639 // frame pointer is valid 2640 #ifdef ASSERT 2641 // Verify frame pointer value in rfp. 2642 add(rtmp, sp, framesize - 2 * wordSize); 2643 Label L_success; 2644 cmp(rfp, rtmp); 2645 br(Assembler::EQ, L_success); 2646 stop("frame pointer mismatch"); 2647 bind(L_success); 2648 #endif // ASSERT 2649 } else { 2650 add(rfp, sp, framesize - 2 * wordSize); 2651 } 2652 } 2653 2654 // Selects elements from two source vectors (src1, src2) based on index values in the index register 2655 // using Neon instructions and places it in the destination vector element corresponding to the 2656 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM), 2657 // where NUM_ELEM is the number of BasicType elements per vector. 2658 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register) 2659 // Otherwise, selects src2[idx – NUM_ELEM] 2660 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1, 2661 FloatRegister src2, FloatRegister index, 2662 FloatRegister tmp, unsigned vector_length_in_bytes) { 2663 assert_different_registers(dst, src1, src2, tmp); 2664 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B; 2665 2666 if (vector_length_in_bytes == 16) { 2667 assert(UseSVE <= 1, "sve must be <= 1"); 2668 assert(src1->successor() == src2, "Source registers must be ordered"); 2669 // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table 2670 tbl(dst, size, src1, 2, index); 2671 } else { // vector length == 8 2672 assert(UseSVE == 0, "must be Neon only"); 2673 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the 2674 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl" 2675 // instruction with one vector lookup 2676 ins(tmp, D, src1, 0, 0); 2677 ins(tmp, D, src2, 1, 0); 2678 tbl(dst, size, tmp, 1, index); 2679 } 2680 } 2681 2682 // Selects elements from two source vectors (src1, src2) based on index values in the index register 2683 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the 2684 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM), 2685 // where NUM_ELEM is the number of BasicType elements per vector. 
2686 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register) 2687 // Otherwise, selects src2[idx – NUM_ELEM] 2688 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1, 2689 FloatRegister src2, FloatRegister index, 2690 FloatRegister tmp, SIMD_RegVariant T, 2691 unsigned vector_length_in_bytes) { 2692 assert_different_registers(dst, src1, src2, index, tmp); 2693 2694 if (vector_length_in_bytes == 8) { 2695 // We need to fit both the source vectors (src1, src2) in a single vector register because the 2696 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to 2697 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl" 2698 // instruction with one vector lookup 2699 assert(UseSVE >= 1, "sve must be >= 1"); 2700 ins(tmp, D, src1, 0, 0); 2701 ins(tmp, D, src2, 1, 0); 2702 sve_tbl(dst, T, tmp, index); 2703 } else { // UseSVE == 2 and vector_length_in_bytes > 8 2704 // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table. 2705 // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation 2706 // is not executed on machines where vector_length_in_bytes < MaxVectorSize 2707 // with the only exception of 8B vector length. 2708 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be"); 2709 assert(src1->successor() == src2, "Source registers must be ordered"); 2710 sve_tbl(dst, T, src1, src2, index); 2711 } 2712 } 2713 2714 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1, 2715 FloatRegister src2, FloatRegister index, 2716 FloatRegister tmp, BasicType bt, 2717 unsigned vector_length_in_bytes) { 2718 2719 assert_different_registers(dst, src1, src2, index, tmp); 2720 2721 // The cases that can reach this method are - 2722 // - UseSVE = 0, vector_length_in_bytes = 8 or 16 2723 // - UseSVE = 1, vector_length_in_bytes = 8 or 16 2724 // - UseSVE = 2, vector_length_in_bytes >= 8 2725 // 2726 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8 2727 // and UseSVE = 2 with vector_length_in_bytes >= 8 2728 // 2729 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and 2730 // UseSVE = 1 with vector_length_in_bytes = 16 2731 2732 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) { 2733 SIMD_RegVariant T = elemType_to_regVariant(bt); 2734 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes); 2735 return; 2736 } 2737 2738 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT 2739 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type"); 2740 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16"); 2741 2742 bool isQ = vector_length_in_bytes == 16; 2743 2744 SIMD_Arrangement size1 = isQ ? T16B : T8B; 2745 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); 2746 2747 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of 2748 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table. 2749 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM 2750 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length, 2751 // the indices can range from [0, 8). 
2752 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0] 2753 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202] 2754 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000] 2755 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100] 2756 // Add the multiplied result to the vector in tmp to obtain the byte level 2757 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100] 2758 // Use these offsets in the "tbl" instruction to select chunks of 2B. 2759 2760 if (bt == T_BYTE) { 2761 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes); 2762 } else { 2763 int elem_size = (bt == T_SHORT) ? 2 : 4; 2764 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u; 2765 2766 mov(tmp, size1, elem_size); 2767 mulv(dst, size2, index, tmp); 2768 mov(tmp, size2, tbl_offset); 2769 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements 2770 // to select a set of 2B/4B 2771 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes); 2772 } 2773 }
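// An illustrative walk-through of the T_INT path above, with values chosen only as an example: for a 16B vector and index = [5, 3, 0, 6], mulv by the 0x04 splat gives [0x14141414, 0x0c0c0c0c, 0x00000000, 0x18181818]; adding the 0x03020100 splat yields byte offsets [0x17161514, 0x0f0e0d0c, 0x03020100, 0x1b1a1918], and the two-vector tbl then selects {src2[1], src1[3], src1[0], src2[2]}.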