1 /* 2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. 3 * Copyright 2026 Arm Limited and/or its affiliates. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/objectMonitorTable.hpp" 35 #include "runtime/stubRoutines.hpp" 36 #include "runtime/synchronizer.hpp" 37 #include "utilities/globalDefinitions.hpp" 38 #include "utilities/powerOfTwo.hpp" 39 40 #ifdef PRODUCT 41 #define BLOCK_COMMENT(str) /* nothing */ 42 #define STOP(error) stop(error) 43 #else 44 #define BLOCK_COMMENT(str) block_comment(str) 45 #define STOP(error) block_comment(error); stop(error) 46 #endif 47 48 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 49 50 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 51 52 void C2_MacroAssembler::entry_barrier() { 53 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 54 // Dummy labels for just measuring the code size 55 Label dummy_slow_path; 56 Label dummy_continuation; 57 Label dummy_guard; 58 Label* slow_path = &dummy_slow_path; 59 Label* continuation = &dummy_continuation; 60 Label* guard = &dummy_guard; 61 if (!Compile::current()->output()->in_scratch_emit_size()) { 62 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 63 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 64 Compile::current()->output()->add_stub(stub); 65 slow_path = &stub->entry(); 66 continuation = &stub->continuation(); 67 guard = &stub->guard(); 68 } 69 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub. 70 bs->nmethod_entry_barrier(this, slow_path, continuation, guard); 71 } 72 73 // jdk.internal.util.ArraysSupport.vectorizedHashCode 74 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 75 FloatRegister vdata0, FloatRegister vdata1, 76 FloatRegister vdata2, FloatRegister vdata3, 77 FloatRegister vmul0, FloatRegister vmul1, 78 FloatRegister vmul2, FloatRegister vmul3, 79 FloatRegister vpow, FloatRegister vpowm, 80 BasicType eltype) { 81 ARRAYS_HASHCODE_REGISTERS; 82 83 Register tmp1 = rscratch1, tmp2 = rscratch2; 84 85 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE; 86 87 // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We 88 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to 89 // use 4H for chars and shorts instead, but using 8H gives better performance. 90 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8 91 : eltype == T_CHAR || eltype == T_SHORT ? 8 92 : eltype == T_INT ? 4 93 : 0; 94 guarantee(vf, "unsupported eltype"); 95 96 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis. 97 const size_t unroll_factor = 4; 98 99 switch (eltype) { 100 case T_BOOLEAN: 101 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); 102 break; 103 case T_CHAR: 104 BLOCK_COMMENT("arrays_hashcode(char) {"); 105 break; 106 case T_BYTE: 107 BLOCK_COMMENT("arrays_hashcode(byte) {"); 108 break; 109 case T_SHORT: 110 BLOCK_COMMENT("arrays_hashcode(short) {"); 111 break; 112 case T_INT: 113 BLOCK_COMMENT("arrays_hashcode(int) {"); 114 break; 115 default: 116 ShouldNotReachHere(); 117 } 118 119 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop 120 // implemented by the stub executes just once. Call the stub only if at least two iterations will 121 // be executed. 122 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf; 123 cmpw(cnt, large_threshold); 124 br(Assembler::HS, LARGE); 125 126 bind(TAIL); 127 128 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past 129 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs. 130 // Iteration eats up the remainder, uf elements at a time. 131 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC"); 132 andr(tmp2, cnt, unroll_factor - 1); 133 adr(tmp1, BR_BASE); 134 // For Cortex-A53 offset is 4 because 2 nops are generated. 135 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3); 136 movw(tmp2, 0x1f); 137 br(tmp1); 138 139 bind(LOOP); 140 for (size_t i = 0; i < unroll_factor; ++i) { 141 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype); 142 maddw(result, result, tmp2, tmp1); 143 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 144 // Generate 2nd nop to have 4 instructions per iteration. 145 if (VM_Version::supports_a53mac()) { 146 nop(); 147 } 148 } 149 bind(BR_BASE); 150 subsw(cnt, cnt, unroll_factor); 151 br(Assembler::HS, LOOP); 152 153 b(DONE); 154 155 bind(LARGE); 156 157 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype)); 158 assert(stub.target() != nullptr, "array_hashcode stub has not been generated"); 159 address tpc = trampoline_call(stub); 160 if (tpc == nullptr) { 161 DEBUG_ONLY(reset_labels(TAIL, BR_BASE)); 162 postcond(pc() == badAddress); 163 return nullptr; 164 } 165 166 bind(DONE); 167 168 BLOCK_COMMENT("} // arrays_hashcode"); 169 170 postcond(pc() != badAddress); 171 return pc(); 172 } 173 174 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1, 175 Register t2, Register t3) { 176 assert_different_registers(obj, box, t1, t2, t3, rscratch2); 177 178 // Handle inflated monitor. 179 Label inflated; 180 // Finish fast lock successfully. MUST branch to with flag == EQ 181 Label locked; 182 // Finish fast lock unsuccessfully. MUST branch to with flag == NE 183 Label slow_path; 184 185 if (UseObjectMonitorTable) { 186 // Clear cache in case fast locking succeeds or we need to take the slow-path. 187 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 188 } 189 190 if (DiagnoseSyncOnValueBasedClasses != 0) { 191 load_klass(t1, obj); 192 ldrb(t1, Address(t1, Klass::misc_flags_offset())); 193 tst(t1, KlassFlags::_misc_is_value_based_class); 194 br(Assembler::NE, slow_path); 195 } 196 197 const Register t1_mark = t1; 198 const Register t3_t = t3; 199 200 { // Fast locking 201 202 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ 203 Label push; 204 205 const Register t2_top = t2; 206 207 // Check if lock-stack is full. 208 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 209 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1); 210 br(Assembler::GT, slow_path); 211 212 // Check if recursive. 213 subw(t3_t, t2_top, oopSize); 214 ldr(t3_t, Address(rthread, t3_t)); 215 cmp(obj, t3_t); 216 br(Assembler::EQ, push); 217 218 // Relaxed normal load to check for monitor. Optimization for monitor case. 219 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 220 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 221 222 // Not inflated 223 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 224 225 // Try to lock. Transition lock-bits 0b01 => 0b00 226 orr(t1_mark, t1_mark, markWord::unlocked_value); 227 eor(t3_t, t1_mark, markWord::unlocked_value); 228 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 229 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 230 br(Assembler::NE, slow_path); 231 232 bind(push); 233 // After successful lock, push object on lock-stack. 234 str(obj, Address(rthread, t2_top)); 235 addw(t2_top, t2_top, oopSize); 236 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 237 b(locked); 238 } 239 240 { // Handle inflated monitor. 241 bind(inflated); 242 243 const Register t1_monitor = t1; 244 245 if (!UseObjectMonitorTable) { 246 assert(t1_monitor == t1_mark, "should be the same here"); 247 } else { 248 const Register t1_hash = t1; 249 Label monitor_found; 250 251 // Save the mark, we might need it to extract the hash. 252 mov(t3, t1_mark); 253 254 // Look for the monitor in the om_cache. 255 256 ByteSize cache_offset = JavaThread::om_cache_oops_offset(); 257 ByteSize monitor_offset = OMCache::oop_to_monitor_difference(); 258 const int num_unrolled = OMCache::CAPACITY; 259 for (int i = 0; i < num_unrolled; i++) { 260 ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset)); 261 ldr(t2, Address(rthread, cache_offset)); 262 cmp(obj, t2); 263 br(Assembler::EQ, monitor_found); 264 cache_offset = cache_offset + OMCache::oop_to_oop_difference(); 265 } 266 267 // Look for the monitor in the table. 268 269 // Get the hash code. 270 ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits); 271 272 // Get the table and calculate the bucket's address 273 lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address())); 274 ldr(t3, Address(t3)); 275 ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset())); 276 ands(t1_hash, t1_hash, t2); 277 ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset())); 278 279 // Read the monitor from the bucket. 280 ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord))); 281 282 // Check if the monitor in the bucket is special (empty, tombstone or removed). 283 cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special); 284 br(Assembler::LO, slow_path); 285 286 // Check if object matches. 287 ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset())); 288 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 289 bs_asm->try_peek_weak_handle_in_nmethod(this, t3, t3, t2, slow_path); 290 cmp(t3, obj); 291 br(Assembler::NE, slow_path); 292 293 bind(monitor_found); 294 } 295 296 const Register t2_owner_addr = t2; 297 const Register t3_owner = t3; 298 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 299 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag); 300 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag); 301 302 Label monitor_locked; 303 304 // Compute owner address. 305 lea(t2_owner_addr, owner_address); 306 307 // Try to CAS owner (no owner => current thread's _monitor_owner_id). 308 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset())); 309 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true, 310 /*release*/ false, /*weak*/ false, t3_owner); 311 br(Assembler::EQ, monitor_locked); 312 313 // Check if recursive. 314 cmp(t3_owner, rscratch2); 315 br(Assembler::NE, slow_path); 316 317 // Recursive. 318 increment(recursions_address, 1); 319 320 bind(monitor_locked); 321 if (UseObjectMonitorTable) { 322 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 323 } 324 } 325 326 bind(locked); 327 328 #ifdef ASSERT 329 // Check that locked label is reached with Flags == EQ. 330 Label flag_correct; 331 br(Assembler::EQ, flag_correct); 332 stop("Fast Lock Flag != EQ"); 333 #endif 334 335 bind(slow_path); 336 #ifdef ASSERT 337 // Check that slow_path label is reached with Flags == NE. 338 br(Assembler::NE, flag_correct); 339 stop("Fast Lock Flag != NE"); 340 bind(flag_correct); 341 #endif 342 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 343 } 344 345 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1, 346 Register t2, Register t3) { 347 assert_different_registers(obj, box, t1, t2, t3); 348 349 // Handle inflated monitor. 350 Label inflated, inflated_load_mark; 351 // Finish fast unlock successfully. MUST branch to with flag == EQ 352 Label unlocked; 353 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 354 Label slow_path; 355 356 const Register t1_mark = t1; 357 const Register t2_top = t2; 358 const Register t3_t = t3; 359 360 { // Fast unlock 361 362 Label push_and_slow_path; 363 364 // Check if obj is top of lock-stack. 365 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 366 subw(t2_top, t2_top, oopSize); 367 ldr(t3_t, Address(rthread, t2_top)); 368 cmp(obj, t3_t); 369 // Top of lock stack was not obj. Must be monitor. 370 br(Assembler::NE, inflated_load_mark); 371 372 // Pop lock-stack. 373 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 374 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 375 376 // Check if recursive. 377 subw(t3_t, t2_top, oopSize); 378 ldr(t3_t, Address(rthread, t3_t)); 379 cmp(obj, t3_t); 380 br(Assembler::EQ, unlocked); 381 382 // Not recursive. 383 // Load Mark. 384 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 385 386 // Check header for monitor (0b10). 387 // Because we got here by popping (meaning we pushed in locked) 388 // there will be no monitor in the box. So we need to push back the obj 389 // so that the runtime can fix any potential anonymous owner. 390 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated); 391 392 // Try to unlock. Transition lock bits 0b00 => 0b01 393 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 394 orr(t3_t, t1_mark, markWord::unlocked_value); 395 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 396 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 397 br(Assembler::EQ, unlocked); 398 399 bind(push_and_slow_path); 400 // Compare and exchange failed. 401 // Restore lock-stack and handle the unlock in runtime. 402 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 403 addw(t2_top, t2_top, oopSize); 404 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 405 b(slow_path); 406 } 407 408 409 { // Handle inflated monitor. 410 bind(inflated_load_mark); 411 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 412 #ifdef ASSERT 413 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 414 stop("Fast Unlock not monitor"); 415 #endif 416 417 bind(inflated); 418 419 #ifdef ASSERT 420 Label check_done; 421 subw(t2_top, t2_top, oopSize); 422 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 423 br(Assembler::LT, check_done); 424 ldr(t3_t, Address(rthread, t2_top)); 425 cmp(obj, t3_t); 426 br(Assembler::NE, inflated); 427 stop("Fast Unlock lock on stack"); 428 bind(check_done); 429 #endif 430 431 const Register t1_monitor = t1; 432 433 if (!UseObjectMonitorTable) { 434 assert(t1_monitor == t1_mark, "should be the same here"); 435 436 // Untag the monitor. 437 add(t1_monitor, t1_mark, -(int)markWord::monitor_value); 438 } else { 439 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 440 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 441 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 442 br(Assembler::LO, slow_path); 443 } 444 445 const Register t2_recursions = t2; 446 Label not_recursive; 447 448 // Check if recursive. 449 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 450 cbz(t2_recursions, not_recursive); 451 452 // Recursive unlock. 453 sub(t2_recursions, t2_recursions, 1u); 454 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 455 // Set flag == EQ 456 cmp(t2_recursions, t2_recursions); 457 b(unlocked); 458 459 bind(not_recursive); 460 461 const Register t2_owner_addr = t2; 462 463 // Compute owner address. 464 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 465 466 // Set owner to null. 467 // Release to satisfy the JMM 468 stlr(zr, t2_owner_addr); 469 // We need a full fence after clearing owner to avoid stranding. 470 // StoreLoad achieves this. 471 membar(StoreLoad); 472 473 // Check if the entry_list is empty. 474 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset())); 475 cmp(rscratch1, zr); 476 br(Assembler::EQ, unlocked); // If so we are done. 477 478 // Check if there is a successor. 479 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset())); 480 cmp(rscratch1, zr); 481 br(Assembler::NE, unlocked); // If so we are done. 482 483 // Save the monitor pointer in the current thread, so we can try to 484 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 485 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 486 487 cmp(zr, rthread); // Set Flag to NE => slow path 488 b(slow_path); 489 } 490 491 bind(unlocked); 492 cmp(zr, zr); // Set Flags to EQ => fast path 493 494 #ifdef ASSERT 495 // Check that unlocked label is reached with Flags == EQ. 496 Label flag_correct; 497 br(Assembler::EQ, flag_correct); 498 stop("Fast Unlock Flag != EQ"); 499 #endif 500 501 bind(slow_path); 502 #ifdef ASSERT 503 // Check that slow_path label is reached with Flags == NE. 504 br(Assembler::NE, flag_correct); 505 stop("Fast Unlock Flag != NE"); 506 bind(flag_correct); 507 #endif 508 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 509 } 510 511 // Search for str1 in str2 and return index or -1 512 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 513 void C2_MacroAssembler::string_indexof(Register str2, Register str1, 514 Register cnt2, Register cnt1, 515 Register tmp1, Register tmp2, 516 Register tmp3, Register tmp4, 517 Register tmp5, Register tmp6, 518 int icnt1, Register result, int ae) { 519 // NOTE: tmp5, tmp6 can be zr depending on specific method version 520 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 521 522 Register ch1 = rscratch1; 523 Register ch2 = rscratch2; 524 Register cnt1tmp = tmp1; 525 Register cnt2tmp = tmp2; 526 Register cnt1_neg = cnt1; 527 Register cnt2_neg = cnt2; 528 Register result_tmp = tmp4; 529 530 bool isL = ae == StrIntrinsicNode::LL; 531 532 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 533 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 534 int str1_chr_shift = str1_isL ? 0:1; 535 int str2_chr_shift = str2_isL ? 0:1; 536 int str1_chr_size = str1_isL ? 1:2; 537 int str2_chr_size = str2_isL ? 1:2; 538 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 539 (chr_insn)&MacroAssembler::ldrh; 540 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 541 (chr_insn)&MacroAssembler::ldrh; 542 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 543 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 544 545 // Note, inline_string_indexOf() generates checks: 546 // if (substr.count > string.count) return -1; 547 // if (substr.count == 0) return 0; 548 549 // We have two strings, a source string in str2, cnt2 and a pattern string 550 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1. 551 552 // For larger pattern and source we use a simplified Boyer Moore algorithm. 553 // With a small pattern and source we use linear scan. 554 555 if (icnt1 == -1) { 556 sub(result_tmp, cnt2, cnt1); 557 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 558 br(LT, LINEARSEARCH); 559 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty 560 subs(zr, cnt1, 256); 561 lsr(tmp1, cnt2, 2); 562 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 563 br(GE, LINEARSTUB); 564 } 565 566 // The Boyer Moore alogorithm is based on the description here:- 567 // 568 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 569 // 570 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 571 // and the 'Good Suffix' rule. 572 // 573 // These rules are essentially heuristics for how far we can shift the 574 // pattern along the search string. 575 // 576 // The implementation here uses the 'Bad Character' rule only because of the 577 // complexity of initialisation for the 'Good Suffix' rule. 578 // 579 // This is also known as the Boyer-Moore-Horspool algorithm:- 580 // 581 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 582 // 583 // This particular implementation has few java-specific optimizations. 584 // 585 // #define ASIZE 256 586 // 587 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 588 // int i, j; 589 // unsigned c; 590 // unsigned char bc[ASIZE]; 591 // 592 // /* Preprocessing */ 593 // for (i = 0; i < ASIZE; ++i) 594 // bc[i] = m; 595 // for (i = 0; i < m - 1; ) { 596 // c = x[i]; 597 // ++i; 598 // // c < 256 for Latin1 string, so, no need for branch 599 // #ifdef PATTERN_STRING_IS_LATIN1 600 // bc[c] = m - i; 601 // #else 602 // if (c < ASIZE) bc[c] = m - i; 603 // #endif 604 // } 605 // 606 // /* Searching */ 607 // j = 0; 608 // while (j <= n - m) { 609 // c = y[i+j]; 610 // if (x[m-1] == c) 611 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 612 // if (i < 0) return j; 613 // // c < 256 for Latin1 string, so, no need for branch 614 // #ifdef SOURCE_STRING_IS_LATIN1 615 // // LL case: (c< 256) always true. Remove branch 616 // j += bc[y[j+m-1]]; 617 // #endif 618 // #ifndef PATTERN_STRING_IS_UTF 619 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 620 // if (c < ASIZE) 621 // j += bc[y[j+m-1]]; 622 // else 623 // j += 1 624 // #endif 625 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 626 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 627 // if (c < ASIZE) 628 // j += bc[y[j+m-1]]; 629 // else 630 // j += m 631 // #endif 632 // } 633 // } 634 635 if (icnt1 == -1) { 636 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 637 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 638 Register cnt1end = tmp2; 639 Register str2end = cnt2; 640 Register skipch = tmp2; 641 642 // str1 length is >=8, so, we can read at least 1 register for cases when 643 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 644 // UL case. We'll re-read last character in inner pre-loop code to have 645 // single outer pre-loop load 646 const int firstStep = isL ? 7 : 3; 647 648 const int ASIZE = 256; 649 const int STORED_BYTES = 32; // amount of bytes stored per instruction 650 sub(sp, sp, ASIZE); 651 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 652 mov(ch1, sp); 653 BIND(BM_INIT_LOOP); 654 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 655 subs(tmp5, tmp5, 1); 656 br(GT, BM_INIT_LOOP); 657 658 sub(cnt1tmp, cnt1, 1); 659 mov(tmp5, str2); 660 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 661 sub(ch2, cnt1, 1); 662 mov(tmp3, str1); 663 BIND(BCLOOP); 664 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 665 if (!str1_isL) { 666 subs(zr, ch1, ASIZE); 667 br(HS, BCSKIP); 668 } 669 strb(ch2, Address(sp, ch1)); 670 BIND(BCSKIP); 671 subs(ch2, ch2, 1); 672 br(GT, BCLOOP); 673 674 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 675 if (str1_isL == str2_isL) { 676 // load last 8 bytes (8LL/4UU symbols) 677 ldr(tmp6, Address(tmp6, -wordSize)); 678 } else { 679 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 680 // convert Latin1 to UTF. We'll have to wait until load completed, but 681 // it's still faster than per-character loads+checks 682 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 683 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 684 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 685 andr(tmp6, tmp6, 0xFF); // str1[N-4] 686 orr(ch2, ch1, ch2, LSL, 16); 687 orr(tmp6, tmp6, tmp3, LSL, 48); 688 orr(tmp6, tmp6, ch2, LSL, 16); 689 } 690 BIND(BMLOOPSTR2); 691 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 692 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 693 if (str1_isL == str2_isL) { 694 // re-init tmp3. It's for free because it's executed in parallel with 695 // load above. Alternative is to initialize it before loop, but it'll 696 // affect performance on in-order systems with 2 or more ld/st pipelines 697 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 698 } 699 if (!isL) { // UU/UL case 700 lsl(ch2, cnt1tmp, 1); // offset in bytes 701 } 702 cmp(tmp3, skipch); 703 br(NE, BMSKIP); 704 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 705 mov(ch1, tmp6); 706 if (isL) { 707 b(BMLOOPSTR1_AFTER_LOAD); 708 } else { 709 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 710 b(BMLOOPSTR1_CMP); 711 } 712 BIND(BMLOOPSTR1); 713 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 714 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 715 BIND(BMLOOPSTR1_AFTER_LOAD); 716 subs(cnt1tmp, cnt1tmp, 1); 717 br(LT, BMLOOPSTR1_LASTCMP); 718 BIND(BMLOOPSTR1_CMP); 719 cmp(ch1, ch2); 720 br(EQ, BMLOOPSTR1); 721 BIND(BMSKIP); 722 if (!isL) { 723 // if we've met UTF symbol while searching Latin1 pattern, then we can 724 // skip cnt1 symbols 725 if (str1_isL != str2_isL) { 726 mov(result_tmp, cnt1); 727 } else { 728 mov(result_tmp, 1); 729 } 730 subs(zr, skipch, ASIZE); 731 br(HS, BMADV); 732 } 733 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 734 BIND(BMADV); 735 sub(cnt1tmp, cnt1, 1); 736 add(str2, str2, result_tmp, LSL, str2_chr_shift); 737 cmp(str2, str2end); 738 br(LE, BMLOOPSTR2); 739 add(sp, sp, ASIZE); 740 b(NOMATCH); 741 BIND(BMLOOPSTR1_LASTCMP); 742 cmp(ch1, ch2); 743 br(NE, BMSKIP); 744 BIND(BMMATCH); 745 sub(result, str2, tmp5); 746 if (!str2_isL) lsr(result, result, 1); 747 add(sp, sp, ASIZE); 748 b(DONE); 749 750 BIND(LINEARSTUB); 751 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 752 br(LT, LINEAR_MEDIUM); 753 mov(result, zr); 754 RuntimeAddress stub = nullptr; 755 if (isL) { 756 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 757 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 758 } else if (str1_isL) { 759 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 760 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 761 } else { 762 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 763 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 764 } 765 address call = trampoline_call(stub); 766 if (call == nullptr) { 767 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 768 ciEnv::current()->record_failure("CodeCache is full"); 769 return; 770 } 771 b(DONE); 772 } 773 774 BIND(LINEARSEARCH); 775 { 776 Label DO1, DO2, DO3; 777 778 Register str2tmp = tmp2; 779 Register first = tmp3; 780 781 if (icnt1 == -1) 782 { 783 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 784 785 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 786 br(LT, DOSHORT); 787 BIND(LINEAR_MEDIUM); 788 (this->*str1_load_1chr)(first, Address(str1)); 789 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 790 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 791 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 792 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 793 794 BIND(FIRST_LOOP); 795 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 796 cmp(first, ch2); 797 br(EQ, STR1_LOOP); 798 BIND(STR2_NEXT); 799 adds(cnt2_neg, cnt2_neg, str2_chr_size); 800 br(LE, FIRST_LOOP); 801 b(NOMATCH); 802 803 BIND(STR1_LOOP); 804 adds(cnt1tmp, cnt1_neg, str1_chr_size); 805 add(cnt2tmp, cnt2_neg, str2_chr_size); 806 br(GE, MATCH); 807 808 BIND(STR1_NEXT); 809 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 810 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 811 cmp(ch1, ch2); 812 br(NE, STR2_NEXT); 813 adds(cnt1tmp, cnt1tmp, str1_chr_size); 814 add(cnt2tmp, cnt2tmp, str2_chr_size); 815 br(LT, STR1_NEXT); 816 b(MATCH); 817 818 BIND(DOSHORT); 819 if (str1_isL == str2_isL) { 820 cmp(cnt1, (u1)2); 821 br(LT, DO1); 822 br(GT, DO3); 823 } 824 } 825 826 if (icnt1 == 4) { 827 Label CH1_LOOP; 828 829 (this->*load_4chr)(ch1, str1); 830 sub(result_tmp, cnt2, 4); 831 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 832 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 833 834 BIND(CH1_LOOP); 835 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 836 cmp(ch1, ch2); 837 br(EQ, MATCH); 838 adds(cnt2_neg, cnt2_neg, str2_chr_size); 839 br(LE, CH1_LOOP); 840 b(NOMATCH); 841 } 842 843 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 844 Label CH1_LOOP; 845 846 BIND(DO2); 847 (this->*load_2chr)(ch1, str1); 848 if (icnt1 == 2) { 849 sub(result_tmp, cnt2, 2); 850 } 851 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 852 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 853 BIND(CH1_LOOP); 854 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 855 cmp(ch1, ch2); 856 br(EQ, MATCH); 857 adds(cnt2_neg, cnt2_neg, str2_chr_size); 858 br(LE, CH1_LOOP); 859 b(NOMATCH); 860 } 861 862 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 863 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 864 865 BIND(DO3); 866 (this->*load_2chr)(first, str1); 867 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 868 if (icnt1 == 3) { 869 sub(result_tmp, cnt2, 3); 870 } 871 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 872 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 873 BIND(FIRST_LOOP); 874 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 875 cmpw(first, ch2); 876 br(EQ, STR1_LOOP); 877 BIND(STR2_NEXT); 878 adds(cnt2_neg, cnt2_neg, str2_chr_size); 879 br(LE, FIRST_LOOP); 880 b(NOMATCH); 881 882 BIND(STR1_LOOP); 883 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 884 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 885 cmp(ch1, ch2); 886 br(NE, STR2_NEXT); 887 b(MATCH); 888 } 889 890 if (icnt1 == -1 || icnt1 == 1) { 891 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 892 893 BIND(DO1); 894 (this->*str1_load_1chr)(ch1, str1); 895 cmp(cnt2, (u1)8); 896 br(LT, DO1_SHORT); 897 898 sub(result_tmp, cnt2, 8/str2_chr_size); 899 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 900 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 901 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 902 903 if (str2_isL) { 904 orr(ch1, ch1, ch1, LSL, 8); 905 } 906 orr(ch1, ch1, ch1, LSL, 16); 907 orr(ch1, ch1, ch1, LSL, 32); 908 BIND(CH1_LOOP); 909 ldr(ch2, Address(str2, cnt2_neg)); 910 eor(ch2, ch1, ch2); 911 sub(tmp1, ch2, tmp3); 912 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 913 bics(tmp1, tmp1, tmp2); 914 br(NE, HAS_ZERO); 915 adds(cnt2_neg, cnt2_neg, 8); 916 br(LT, CH1_LOOP); 917 918 cmp(cnt2_neg, (u1)8); 919 mov(cnt2_neg, 0); 920 br(LT, CH1_LOOP); 921 b(NOMATCH); 922 923 BIND(HAS_ZERO); 924 rev(tmp1, tmp1); 925 clz(tmp1, tmp1); 926 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 927 b(MATCH); 928 929 BIND(DO1_SHORT); 930 mov(result_tmp, cnt2); 931 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 932 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 933 BIND(DO1_LOOP); 934 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 935 cmpw(ch1, ch2); 936 br(EQ, MATCH); 937 adds(cnt2_neg, cnt2_neg, str2_chr_size); 938 br(LT, DO1_LOOP); 939 } 940 } 941 BIND(NOMATCH); 942 mov(result, -1); 943 b(DONE); 944 BIND(MATCH); 945 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 946 BIND(DONE); 947 } 948 949 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 950 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 951 952 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 953 Register ch, Register result, 954 Register tmp1, Register tmp2, Register tmp3) 955 { 956 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 957 Register cnt1_neg = cnt1; 958 Register ch1 = rscratch1; 959 Register result_tmp = rscratch2; 960 961 cbz(cnt1, NOMATCH); 962 963 cmp(cnt1, (u1)4); 964 br(LT, DO1_SHORT); 965 966 orr(ch, ch, ch, LSL, 16); 967 orr(ch, ch, ch, LSL, 32); 968 969 sub(cnt1, cnt1, 4); 970 mov(result_tmp, cnt1); 971 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 972 sub(cnt1_neg, zr, cnt1, LSL, 1); 973 974 mov(tmp3, 0x0001000100010001); 975 976 BIND(CH1_LOOP); 977 ldr(ch1, Address(str1, cnt1_neg)); 978 eor(ch1, ch, ch1); 979 sub(tmp1, ch1, tmp3); 980 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 981 bics(tmp1, tmp1, tmp2); 982 br(NE, HAS_ZERO); 983 adds(cnt1_neg, cnt1_neg, 8); 984 br(LT, CH1_LOOP); 985 986 cmp(cnt1_neg, (u1)8); 987 mov(cnt1_neg, 0); 988 br(LT, CH1_LOOP); 989 b(NOMATCH); 990 991 BIND(HAS_ZERO); 992 rev(tmp1, tmp1); 993 clz(tmp1, tmp1); 994 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 995 b(MATCH); 996 997 BIND(DO1_SHORT); 998 mov(result_tmp, cnt1); 999 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1000 sub(cnt1_neg, zr, cnt1, LSL, 1); 1001 BIND(DO1_LOOP); 1002 ldrh(ch1, Address(str1, cnt1_neg)); 1003 cmpw(ch, ch1); 1004 br(EQ, MATCH); 1005 adds(cnt1_neg, cnt1_neg, 2); 1006 br(LT, DO1_LOOP); 1007 BIND(NOMATCH); 1008 mov(result, -1); 1009 b(DONE); 1010 BIND(MATCH); 1011 add(result, result_tmp, cnt1_neg, ASR, 1); 1012 BIND(DONE); 1013 } 1014 1015 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 1016 Register ch, Register result, 1017 FloatRegister ztmp1, 1018 FloatRegister ztmp2, 1019 PRegister tmp_pg, 1020 PRegister tmp_pdn, bool isL) 1021 { 1022 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 1023 assert(tmp_pg->is_governing(), 1024 "this register has to be a governing predicate register"); 1025 1026 Label LOOP, MATCH, DONE, NOMATCH; 1027 Register vec_len = rscratch1; 1028 Register idx = rscratch2; 1029 1030 SIMD_RegVariant T = (isL == true) ? B : H; 1031 1032 cbz(cnt1, NOMATCH); 1033 1034 // Assign the particular char throughout the vector. 1035 sve_dup(ztmp2, T, ch); 1036 if (isL) { 1037 sve_cntb(vec_len); 1038 } else { 1039 sve_cnth(vec_len); 1040 } 1041 mov(idx, 0); 1042 1043 // Generate a predicate to control the reading of input string. 1044 sve_whilelt(tmp_pg, T, idx, cnt1); 1045 1046 BIND(LOOP); 1047 // Read a vector of 8- or 16-bit data depending on the string type. Note 1048 // that inactive elements indicated by the predicate register won't cause 1049 // a data read from memory to the destination vector. 1050 if (isL) { 1051 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1052 } else { 1053 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1054 } 1055 add(idx, idx, vec_len); 1056 1057 // Perform the comparison. An element of the destination predicate is set 1058 // to active if the particular char is matched. 1059 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1060 1061 // Branch if the particular char is found. 1062 br(NE, MATCH); 1063 1064 sve_whilelt(tmp_pg, T, idx, cnt1); 1065 1066 // Loop back if the particular char not found. 1067 br(MI, LOOP); 1068 1069 BIND(NOMATCH); 1070 mov(result, -1); 1071 b(DONE); 1072 1073 BIND(MATCH); 1074 // Undo the index increment. 1075 sub(idx, idx, vec_len); 1076 1077 // Crop the vector to find its location. 1078 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1079 add(result, idx, -1); 1080 sve_incp(result, T, tmp_pdn); 1081 BIND(DONE); 1082 } 1083 1084 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1085 Register ch, Register result, 1086 Register tmp1, Register tmp2, Register tmp3) 1087 { 1088 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1089 Register cnt1_neg = cnt1; 1090 Register ch1 = rscratch1; 1091 Register result_tmp = rscratch2; 1092 1093 cbz(cnt1, NOMATCH); 1094 1095 cmp(cnt1, (u1)8); 1096 br(LT, DO1_SHORT); 1097 1098 orr(ch, ch, ch, LSL, 8); 1099 orr(ch, ch, ch, LSL, 16); 1100 orr(ch, ch, ch, LSL, 32); 1101 1102 sub(cnt1, cnt1, 8); 1103 mov(result_tmp, cnt1); 1104 lea(str1, Address(str1, cnt1)); 1105 sub(cnt1_neg, zr, cnt1); 1106 1107 mov(tmp3, 0x0101010101010101); 1108 1109 BIND(CH1_LOOP); 1110 ldr(ch1, Address(str1, cnt1_neg)); 1111 eor(ch1, ch, ch1); 1112 sub(tmp1, ch1, tmp3); 1113 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1114 bics(tmp1, tmp1, tmp2); 1115 br(NE, HAS_ZERO); 1116 adds(cnt1_neg, cnt1_neg, 8); 1117 br(LT, CH1_LOOP); 1118 1119 cmp(cnt1_neg, (u1)8); 1120 mov(cnt1_neg, 0); 1121 br(LT, CH1_LOOP); 1122 b(NOMATCH); 1123 1124 BIND(HAS_ZERO); 1125 rev(tmp1, tmp1); 1126 clz(tmp1, tmp1); 1127 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1128 b(MATCH); 1129 1130 BIND(DO1_SHORT); 1131 mov(result_tmp, cnt1); 1132 lea(str1, Address(str1, cnt1)); 1133 sub(cnt1_neg, zr, cnt1); 1134 BIND(DO1_LOOP); 1135 ldrb(ch1, Address(str1, cnt1_neg)); 1136 cmp(ch, ch1); 1137 br(EQ, MATCH); 1138 adds(cnt1_neg, cnt1_neg, 1); 1139 br(LT, DO1_LOOP); 1140 BIND(NOMATCH); 1141 mov(result, -1); 1142 b(DONE); 1143 BIND(MATCH); 1144 add(result, result_tmp, cnt1_neg); 1145 BIND(DONE); 1146 } 1147 1148 // Compare strings. 1149 void C2_MacroAssembler::string_compare(Register str1, Register str2, 1150 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2, 1151 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, 1152 PRegister pgtmp1, PRegister pgtmp2, int ae) { 1153 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB, 1154 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, 1155 SHORT_LOOP_START, TAIL_CHECK; 1156 1157 bool isLL = ae == StrIntrinsicNode::LL; 1158 bool isLU = ae == StrIntrinsicNode::LU; 1159 bool isUL = ae == StrIntrinsicNode::UL; 1160 1161 // The stub threshold for LL strings is: 72 (64 + 8) chars 1162 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch) 1163 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least) 1164 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36); 1165 1166 bool str1_isL = isLL || isLU; 1167 bool str2_isL = isLL || isUL; 1168 1169 int str1_chr_shift = str1_isL ? 0 : 1; 1170 int str2_chr_shift = str2_isL ? 0 : 1; 1171 int str1_chr_size = str1_isL ? 1 : 2; 1172 int str2_chr_size = str2_isL ? 1 : 2; 1173 int minCharsInWord = isLL ? wordSize : wordSize/2; 1174 1175 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2; 1176 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 1177 (chr_insn)&MacroAssembler::ldrh; 1178 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 1179 (chr_insn)&MacroAssembler::ldrh; 1180 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw : 1181 (uxt_insn)&MacroAssembler::uxthw; 1182 1183 BLOCK_COMMENT("string_compare {"); 1184 1185 // Bizarrely, the counts are passed in bytes, regardless of whether they 1186 // are L or U strings, however the result is always in characters. 1187 if (!str1_isL) asrw(cnt1, cnt1, 1); 1188 if (!str2_isL) asrw(cnt2, cnt2, 1); 1189 1190 // Compute the minimum of the string lengths and save the difference. 1191 subsw(result, cnt1, cnt2); 1192 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1193 1194 // A very short string 1195 cmpw(cnt2, minCharsInWord); 1196 br(Assembler::LE, SHORT_STRING); 1197 1198 // Compare longwords 1199 // load first parts of strings and finish initialization while loading 1200 { 1201 if (str1_isL == str2_isL) { // LL or UU 1202 ldr(tmp1, Address(str1)); 1203 cmp(str1, str2); 1204 br(Assembler::EQ, DONE); 1205 ldr(tmp2, Address(str2)); 1206 cmp(cnt2, stub_threshold); 1207 br(GE, STUB); 1208 subsw(cnt2, cnt2, minCharsInWord); 1209 br(EQ, TAIL_CHECK); 1210 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1211 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1212 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1213 } else if (isLU) { 1214 ldrs(vtmp, Address(str1)); 1215 ldr(tmp2, Address(str2)); 1216 cmp(cnt2, stub_threshold); 1217 br(GE, STUB); 1218 subw(cnt2, cnt2, 4); 1219 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1220 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1221 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1222 zip1(vtmp, T8B, vtmp, vtmpZ); 1223 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1224 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1225 add(cnt1, cnt1, 4); 1226 fmovd(tmp1, vtmp); 1227 } else { // UL case 1228 ldr(tmp1, Address(str1)); 1229 ldrs(vtmp, Address(str2)); 1230 cmp(cnt2, stub_threshold); 1231 br(GE, STUB); 1232 subw(cnt2, cnt2, 4); 1233 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1234 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1235 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1236 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1237 zip1(vtmp, T8B, vtmp, vtmpZ); 1238 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1239 add(cnt1, cnt1, 8); 1240 fmovd(tmp2, vtmp); 1241 } 1242 adds(cnt2, cnt2, isUL ? 4 : 8); 1243 br(GE, TAIL); 1244 eor(rscratch2, tmp1, tmp2); 1245 cbnz(rscratch2, DIFF); 1246 // main loop 1247 bind(NEXT_WORD); 1248 if (str1_isL == str2_isL) { 1249 ldr(tmp1, Address(str1, cnt2)); 1250 ldr(tmp2, Address(str2, cnt2)); 1251 adds(cnt2, cnt2, 8); 1252 } else if (isLU) { 1253 ldrs(vtmp, Address(str1, cnt1)); 1254 ldr(tmp2, Address(str2, cnt2)); 1255 add(cnt1, cnt1, 4); 1256 zip1(vtmp, T8B, vtmp, vtmpZ); 1257 fmovd(tmp1, vtmp); 1258 adds(cnt2, cnt2, 8); 1259 } else { // UL 1260 ldrs(vtmp, Address(str2, cnt2)); 1261 ldr(tmp1, Address(str1, cnt1)); 1262 zip1(vtmp, T8B, vtmp, vtmpZ); 1263 add(cnt1, cnt1, 8); 1264 fmovd(tmp2, vtmp); 1265 adds(cnt2, cnt2, 4); 1266 } 1267 br(GE, TAIL); 1268 1269 eor(rscratch2, tmp1, tmp2); 1270 cbz(rscratch2, NEXT_WORD); 1271 b(DIFF); 1272 bind(TAIL); 1273 eor(rscratch2, tmp1, tmp2); 1274 cbnz(rscratch2, DIFF); 1275 // Last longword. In the case where length == 4 we compare the 1276 // same longword twice, but that's still faster than another 1277 // conditional branch. 1278 if (str1_isL == str2_isL) { 1279 ldr(tmp1, Address(str1)); 1280 ldr(tmp2, Address(str2)); 1281 } else if (isLU) { 1282 ldrs(vtmp, Address(str1)); 1283 ldr(tmp2, Address(str2)); 1284 zip1(vtmp, T8B, vtmp, vtmpZ); 1285 fmovd(tmp1, vtmp); 1286 } else { // UL 1287 ldrs(vtmp, Address(str2)); 1288 ldr(tmp1, Address(str1)); 1289 zip1(vtmp, T8B, vtmp, vtmpZ); 1290 fmovd(tmp2, vtmp); 1291 } 1292 bind(TAIL_CHECK); 1293 eor(rscratch2, tmp1, tmp2); 1294 cbz(rscratch2, DONE); 1295 1296 // Find the first different characters in the longwords and 1297 // compute their difference. 1298 bind(DIFF); 1299 rev(rscratch2, rscratch2); 1300 clz(rscratch2, rscratch2); 1301 andr(rscratch2, rscratch2, isLL ? -8 : -16); 1302 lsrv(tmp1, tmp1, rscratch2); 1303 (this->*ext_chr)(tmp1, tmp1); 1304 lsrv(tmp2, tmp2, rscratch2); 1305 (this->*ext_chr)(tmp2, tmp2); 1306 subw(result, tmp1, tmp2); 1307 b(DONE); 1308 } 1309 1310 bind(STUB); 1311 RuntimeAddress stub = nullptr; 1312 switch(ae) { 1313 case StrIntrinsicNode::LL: 1314 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1315 break; 1316 case StrIntrinsicNode::UU: 1317 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1318 break; 1319 case StrIntrinsicNode::LU: 1320 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1321 break; 1322 case StrIntrinsicNode::UL: 1323 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1324 break; 1325 default: 1326 ShouldNotReachHere(); 1327 } 1328 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1329 address call = trampoline_call(stub); 1330 if (call == nullptr) { 1331 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1332 ciEnv::current()->record_failure("CodeCache is full"); 1333 return; 1334 } 1335 b(DONE); 1336 1337 bind(SHORT_STRING); 1338 // Is the minimum length zero? 1339 cbz(cnt2, DONE); 1340 // arrange code to do most branches while loading and loading next characters 1341 // while comparing previous 1342 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1343 subs(cnt2, cnt2, 1); 1344 br(EQ, SHORT_LAST_INIT); 1345 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1346 b(SHORT_LOOP_START); 1347 bind(SHORT_LOOP); 1348 subs(cnt2, cnt2, 1); 1349 br(EQ, SHORT_LAST); 1350 bind(SHORT_LOOP_START); 1351 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1352 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1353 cmp(tmp1, cnt1); 1354 br(NE, SHORT_LOOP_TAIL); 1355 subs(cnt2, cnt2, 1); 1356 br(EQ, SHORT_LAST2); 1357 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1358 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1359 cmp(tmp2, rscratch1); 1360 br(EQ, SHORT_LOOP); 1361 sub(result, tmp2, rscratch1); 1362 b(DONE); 1363 bind(SHORT_LOOP_TAIL); 1364 sub(result, tmp1, cnt1); 1365 b(DONE); 1366 bind(SHORT_LAST2); 1367 cmp(tmp2, rscratch1); 1368 br(EQ, DONE); 1369 sub(result, tmp2, rscratch1); 1370 1371 b(DONE); 1372 bind(SHORT_LAST_INIT); 1373 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1374 bind(SHORT_LAST); 1375 cmp(tmp1, cnt1); 1376 br(EQ, DONE); 1377 sub(result, tmp1, cnt1); 1378 1379 bind(DONE); 1380 1381 BLOCK_COMMENT("} string_compare"); 1382 } 1383 1384 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1385 FloatRegister src2, Condition cond, bool isQ) { 1386 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1387 FloatRegister zn = src1, zm = src2; 1388 bool needs_negation = false; 1389 switch (cond) { 1390 case LT: cond = GT; zn = src2; zm = src1; break; 1391 case LE: cond = GE; zn = src2; zm = src1; break; 1392 case LO: cond = HI; zn = src2; zm = src1; break; 1393 case LS: cond = HS; zn = src2; zm = src1; break; 1394 case NE: cond = EQ; needs_negation = true; break; 1395 default: 1396 break; 1397 } 1398 1399 if (is_floating_point_type(bt)) { 1400 fcm(cond, dst, size, zn, zm); 1401 } else { 1402 cm(cond, dst, size, zn, zm); 1403 } 1404 1405 if (needs_negation) { 1406 notr(dst, isQ ? T16B : T8B, dst); 1407 } 1408 } 1409 1410 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1411 Condition cond, bool isQ) { 1412 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1413 if (bt == T_FLOAT || bt == T_DOUBLE) { 1414 if (cond == Assembler::NE) { 1415 fcm(Assembler::EQ, dst, size, src); 1416 notr(dst, isQ ? T16B : T8B, dst); 1417 } else { 1418 fcm(cond, dst, size, src); 1419 } 1420 } else { 1421 if (cond == Assembler::NE) { 1422 cm(Assembler::EQ, dst, size, src); 1423 notr(dst, isQ ? T16B : T8B, dst); 1424 } else { 1425 cm(cond, dst, size, src); 1426 } 1427 } 1428 } 1429 1430 // Compress the least significant bit of each byte to the rightmost and clear 1431 // the higher garbage bits. 1432 void C2_MacroAssembler::bytemask_compress(Register dst) { 1433 // Example input, dst = 0x01 00 00 00 01 01 00 01 1434 // The "??" bytes are garbage. 1435 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1436 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1437 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1438 andr(dst, dst, 0xff); // dst = 0x8D 1439 } 1440 1441 // Pack the value of each mask element in "src" into a long value in "dst", at most 1442 // the first 64 lane elements. The input "src" is a vector of boolean represented as 1443 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into 1444 // one bit in "dst". 1445 // 1446 // Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16 1447 // Expected: dst = 0x658D 1448 // 1449 // Clobbers: rscratch1 1450 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src, 1451 FloatRegister vtmp, int lane_cnt) { 1452 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1453 assert_different_registers(dst, rscratch1); 1454 assert_different_registers(src, vtmp); 1455 assert(UseSVE > 0, "must be"); 1456 1457 // Compress the lowest 8 bytes. 1458 fmovd(dst, src); 1459 bytemask_compress(dst); 1460 if (lane_cnt <= 8) return; 1461 1462 // Repeat on higher bytes and join the results. 1463 // Compress 8 bytes in each iteration. 1464 for (int idx = 1; idx < (lane_cnt / 8); idx++) { 1465 sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp); 1466 bytemask_compress(rscratch1); 1467 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); 1468 } 1469 } 1470 1471 // The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT 1472 // instruction which requires the FEAT_BITPERM feature. 1473 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src, 1474 FloatRegister vtmp1, FloatRegister vtmp2, 1475 int lane_cnt) { 1476 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1477 assert_different_registers(src, vtmp1, vtmp2); 1478 assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be"); 1479 1480 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1481 // is to compress each significant bit of the byte in a cross-lane way. Due 1482 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1483 // (bit-compress in each lane) with the biggest lane size (T = D) then 1484 // concatenate the results. 1485 1486 // The second source input of BEXT, initialized with 0x01 in each byte. 1487 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1488 sve_dup(vtmp2, B, 1); 1489 1490 // BEXT vtmp1.D, src.D, vtmp2.D 1491 // src = 0x0001010000010001 | 0x0100000001010001 1492 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1493 // --------------------------------------- 1494 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1495 sve_bext(vtmp1, D, src, vtmp2); 1496 1497 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1498 // result to dst. 1499 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1500 // dst = 0x658D 1501 if (lane_cnt <= 8) { 1502 // No need to concatenate. 1503 umov(dst, vtmp1, B, 0); 1504 } else if (lane_cnt <= 16) { 1505 ins(vtmp1, B, vtmp1, 1, 8); 1506 umov(dst, vtmp1, H, 0); 1507 } else { 1508 // As the lane count is 64 at most, the final expected value must be in 1509 // the lowest 64 bits after narrowing vtmp1 from D to B. 1510 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2); 1511 umov(dst, vtmp1, D, 0); 1512 } 1513 } 1514 1515 // Unpack the mask, a long value in "src", into a vector register of boolean 1516 // represented as bytes with 0x00/0x01 as element values in "dst". Each bit in 1517 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at 1518 // most 64 lanes. 1519 // 1520 // Below example gives the expected dst vector register, with a valid src(0x658D) 1521 // on a 128-bit vector size machine. 1522 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01 1523 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src, 1524 FloatRegister vtmp, int lane_cnt) { 1525 assert_different_registers(dst, vtmp); 1526 assert(UseSVE == 2 && VM_Version::supports_svebitperm() && 1527 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported"); 1528 1529 // Example: src = 0x658D, lane_cnt = 16 1530 // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01 1531 1532 // Put long value from general purpose register into the first lane of vector. 1533 // vtmp = 0x0000000000000000 | 0x000000000000658D 1534 sve_dup(vtmp, B, 0); 1535 mov(vtmp, D, 0, src); 1536 1537 // Transform the value in the first lane which is mask in bit now to the mask in 1538 // byte, which can be done by SVE2's BDEP instruction. 1539 1540 // The first source input of BDEP instruction. Deposite each byte in every 8 bytes. 1541 // vtmp = 0x0000000000000065 | 0x000000000000008D 1542 if (lane_cnt <= 8) { 1543 // Nothing. As only one byte exsits. 1544 } else if (lane_cnt <= 16) { 1545 ins(vtmp, B, vtmp, 8, 1); 1546 } else { 1547 sve_vector_extend(vtmp, D, vtmp, B); 1548 } 1549 1550 // The second source input of BDEP instruction, initialized with 0x01 for each byte. 1551 // dst = 0x01010101 0x01010101 0x01010101 0x01010101 1552 sve_dup(dst, B, 1); 1553 1554 // BDEP dst.D, vtmp.D, dst.D 1555 // vtmp = 0x0000000000000065 | 0x000000000000008D 1556 // dst = 0x0101010101010101 | 0x0101010101010101 1557 // --------------------------------------- 1558 // dst = 0x0001010000010001 | 0x0100000001010001 1559 sve_bdep(dst, D, vtmp, dst); 1560 } 1561 1562 // Clobbers: rflags 1563 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1564 FloatRegister zn, FloatRegister zm, Condition cond) { 1565 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1566 FloatRegister z1 = zn, z2 = zm; 1567 switch (cond) { 1568 case LE: z1 = zm; z2 = zn; cond = GE; break; 1569 case LT: z1 = zm; z2 = zn; cond = GT; break; 1570 case LO: z1 = zm; z2 = zn; cond = HI; break; 1571 case LS: z1 = zm; z2 = zn; cond = HS; break; 1572 default: 1573 break; 1574 } 1575 1576 SIMD_RegVariant size = elemType_to_regVariant(bt); 1577 if (is_floating_point_type(bt)) { 1578 sve_fcm(cond, pd, size, pg, z1, z2); 1579 } else { 1580 assert(is_integral_type(bt), "unsupported element type"); 1581 sve_cmp(cond, pd, size, pg, z1, z2); 1582 } 1583 } 1584 1585 // Get index of the last mask lane that is set 1586 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1587 SIMD_RegVariant size = elemType_to_regVariant(bt); 1588 sve_rev(ptmp, size, src); 1589 sve_brkb(ptmp, ptrue, ptmp, false); 1590 sve_cntp(dst, size, ptrue, ptmp); 1591 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1592 subw(dst, rscratch1, dst); 1593 } 1594 1595 // Extend integer vector src to dst with the same lane count 1596 // but larger element size, e.g. 4B -> 4I 1597 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1598 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1599 if (src_bt == T_BYTE) { 1600 // 4B to 4S/4I, 8B to 8S 1601 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported"); 1602 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported"); 1603 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1604 if (dst_bt == T_INT) { 1605 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1606 } 1607 } else if (src_bt == T_SHORT) { 1608 // 2S to 2I/2L, 4S to 4I 1609 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported"); 1610 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported"); 1611 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1612 if (dst_bt == T_LONG) { 1613 _xshll(is_unsigned, dst, T2D, dst, T2S, 0); 1614 } 1615 } else if (src_bt == T_INT) { 1616 // 2I to 2L 1617 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1618 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1619 } else { 1620 ShouldNotReachHere(); 1621 } 1622 } 1623 1624 // Narrow integer vector src down to dst with the same lane count 1625 // but smaller element size, e.g. 4I -> 4B 1626 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1627 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1628 if (src_bt == T_SHORT) { 1629 // 4S/8S to 4B/8B 1630 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1631 assert(dst_bt == T_BYTE, "unsupported"); 1632 xtn(dst, T8B, src, T8H); 1633 } else if (src_bt == T_INT) { 1634 // 2I to 2S, 4I to 4B/4S 1635 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1636 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1637 xtn(dst, T4H, src, T4S); 1638 if (dst_bt == T_BYTE) { 1639 xtn(dst, T8B, dst, T8H); 1640 } 1641 } else if (src_bt == T_LONG) { 1642 // 2L to 2S/2I 1643 assert(src_vlen_in_bytes == 16, "unsupported"); 1644 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported"); 1645 xtn(dst, T2S, src, T2D); 1646 if (dst_bt == T_SHORT) { 1647 xtn(dst, T4H, dst, T4S); 1648 } 1649 } else { 1650 ShouldNotReachHere(); 1651 } 1652 } 1653 1654 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1655 FloatRegister src, SIMD_RegVariant src_size, 1656 bool is_unsigned) { 1657 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1658 1659 if (src_size == B) { 1660 switch (dst_size) { 1661 case H: 1662 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1663 break; 1664 case S: 1665 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1666 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1667 break; 1668 case D: 1669 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1670 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1671 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1672 break; 1673 default: 1674 ShouldNotReachHere(); 1675 } 1676 } else if (src_size == H) { 1677 if (dst_size == S) { 1678 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1679 } else { // D 1680 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1681 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1682 } 1683 } else if (src_size == S) { 1684 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1685 } 1686 } 1687 1688 // Vector narrow from src to dst with specified element sizes. 1689 // High part of dst vector will be filled with zero. 1690 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1691 FloatRegister src, SIMD_RegVariant src_size, 1692 FloatRegister tmp) { 1693 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1694 assert_different_registers(src, tmp); 1695 sve_dup(tmp, src_size, 0); 1696 if (src_size == D) { 1697 switch (dst_size) { 1698 case S: 1699 sve_uzp1(dst, S, src, tmp); 1700 break; 1701 case H: 1702 assert_different_registers(dst, tmp); 1703 sve_uzp1(dst, S, src, tmp); 1704 sve_uzp1(dst, H, dst, tmp); 1705 break; 1706 case B: 1707 assert_different_registers(dst, tmp); 1708 sve_uzp1(dst, S, src, tmp); 1709 sve_uzp1(dst, H, dst, tmp); 1710 sve_uzp1(dst, B, dst, tmp); 1711 break; 1712 default: 1713 ShouldNotReachHere(); 1714 } 1715 } else if (src_size == S) { 1716 if (dst_size == H) { 1717 sve_uzp1(dst, H, src, tmp); 1718 } else { // B 1719 assert_different_registers(dst, tmp); 1720 sve_uzp1(dst, H, src, tmp); 1721 sve_uzp1(dst, B, dst, tmp); 1722 } 1723 } else if (src_size == H) { 1724 sve_uzp1(dst, B, src, tmp); 1725 } 1726 } 1727 1728 // Extend src predicate to dst predicate with the same lane count but larger 1729 // element size, e.g. 64Byte -> 512Long 1730 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1731 uint dst_element_length_in_bytes, 1732 uint src_element_length_in_bytes) { 1733 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1734 sve_punpklo(dst, src); 1735 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1736 sve_punpklo(dst, src); 1737 sve_punpklo(dst, dst); 1738 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1739 sve_punpklo(dst, src); 1740 sve_punpklo(dst, dst); 1741 sve_punpklo(dst, dst); 1742 } else { 1743 assert(false, "unsupported"); 1744 ShouldNotReachHere(); 1745 } 1746 } 1747 1748 // Narrow src predicate to dst predicate with the same lane count but 1749 // smaller element size, e.g. 512Long -> 64Byte 1750 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1751 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1752 // The insignificant bits in src predicate are expected to be zero. 1753 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1754 // passed as the second argument. An example narrowing operation with a given mask would be - 1755 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1756 // Mask (for 2 Longs) : TF 1757 // Predicate register for the above mask (16 bits) : 00000001 00000000 1758 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1759 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1760 assert_different_registers(src, ptmp); 1761 assert_different_registers(dst, ptmp); 1762 sve_pfalse(ptmp); 1763 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1764 sve_uzp1(dst, B, src, ptmp); 1765 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1766 sve_uzp1(dst, H, src, ptmp); 1767 sve_uzp1(dst, B, dst, ptmp); 1768 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1769 sve_uzp1(dst, S, src, ptmp); 1770 sve_uzp1(dst, H, dst, ptmp); 1771 sve_uzp1(dst, B, dst, ptmp); 1772 } else { 1773 assert(false, "unsupported"); 1774 ShouldNotReachHere(); 1775 } 1776 } 1777 1778 // Vector reduction add for integral type with ASIMD instructions. 1779 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1780 Register isrc, FloatRegister vsrc, 1781 unsigned vector_length_in_bytes, 1782 FloatRegister vtmp) { 1783 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1784 assert_different_registers(dst, isrc); 1785 bool isQ = vector_length_in_bytes == 16; 1786 1787 BLOCK_COMMENT("neon_reduce_add_integral {"); 1788 switch(bt) { 1789 case T_BYTE: 1790 addv(vtmp, isQ ? T16B : T8B, vsrc); 1791 smov(dst, vtmp, B, 0); 1792 addw(dst, dst, isrc, ext::sxtb); 1793 break; 1794 case T_SHORT: 1795 addv(vtmp, isQ ? T8H : T4H, vsrc); 1796 smov(dst, vtmp, H, 0); 1797 addw(dst, dst, isrc, ext::sxth); 1798 break; 1799 case T_INT: 1800 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1801 umov(dst, vtmp, S, 0); 1802 addw(dst, dst, isrc); 1803 break; 1804 case T_LONG: 1805 assert(isQ, "unsupported"); 1806 addpd(vtmp, vsrc); 1807 umov(dst, vtmp, D, 0); 1808 add(dst, dst, isrc); 1809 break; 1810 default: 1811 assert(false, "unsupported"); 1812 ShouldNotReachHere(); 1813 } 1814 BLOCK_COMMENT("} neon_reduce_add_integral"); 1815 } 1816 1817 // Vector reduction multiply for integral type with ASIMD instructions. 1818 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1819 // Clobbers: rscratch1 1820 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1821 Register isrc, FloatRegister vsrc, 1822 unsigned vector_length_in_bytes, 1823 FloatRegister vtmp1, FloatRegister vtmp2) { 1824 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1825 bool isQ = vector_length_in_bytes == 16; 1826 1827 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1828 switch(bt) { 1829 case T_BYTE: 1830 if (isQ) { 1831 // Multiply the lower half and higher half of vector iteratively. 1832 // vtmp1 = vsrc[8:15] 1833 ins(vtmp1, D, vsrc, 0, 1); 1834 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1835 mulv(vtmp1, T8B, vtmp1, vsrc); 1836 // vtmp2 = vtmp1[4:7] 1837 ins(vtmp2, S, vtmp1, 0, 1); 1838 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1839 mulv(vtmp1, T8B, vtmp2, vtmp1); 1840 } else { 1841 ins(vtmp1, S, vsrc, 0, 1); 1842 mulv(vtmp1, T8B, vtmp1, vsrc); 1843 } 1844 // vtmp2 = vtmp1[2:3] 1845 ins(vtmp2, H, vtmp1, 0, 1); 1846 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1847 mulv(vtmp2, T8B, vtmp2, vtmp1); 1848 // dst = vtmp2[0] * isrc * vtmp2[1] 1849 umov(rscratch1, vtmp2, B, 0); 1850 mulw(dst, rscratch1, isrc); 1851 sxtb(dst, dst); 1852 umov(rscratch1, vtmp2, B, 1); 1853 mulw(dst, rscratch1, dst); 1854 sxtb(dst, dst); 1855 break; 1856 case T_SHORT: 1857 if (isQ) { 1858 ins(vtmp2, D, vsrc, 0, 1); 1859 mulv(vtmp2, T4H, vtmp2, vsrc); 1860 ins(vtmp1, S, vtmp2, 0, 1); 1861 mulv(vtmp1, T4H, vtmp1, vtmp2); 1862 } else { 1863 ins(vtmp1, S, vsrc, 0, 1); 1864 mulv(vtmp1, T4H, vtmp1, vsrc); 1865 } 1866 umov(rscratch1, vtmp1, H, 0); 1867 mulw(dst, rscratch1, isrc); 1868 sxth(dst, dst); 1869 umov(rscratch1, vtmp1, H, 1); 1870 mulw(dst, rscratch1, dst); 1871 sxth(dst, dst); 1872 break; 1873 case T_INT: 1874 if (isQ) { 1875 ins(vtmp1, D, vsrc, 0, 1); 1876 mulv(vtmp1, T2S, vtmp1, vsrc); 1877 } else { 1878 vtmp1 = vsrc; 1879 } 1880 umov(rscratch1, vtmp1, S, 0); 1881 mul(dst, rscratch1, isrc); 1882 umov(rscratch1, vtmp1, S, 1); 1883 mul(dst, rscratch1, dst); 1884 break; 1885 case T_LONG: 1886 umov(rscratch1, vsrc, D, 0); 1887 mul(dst, isrc, rscratch1); 1888 umov(rscratch1, vsrc, D, 1); 1889 mul(dst, dst, rscratch1); 1890 break; 1891 default: 1892 assert(false, "unsupported"); 1893 ShouldNotReachHere(); 1894 } 1895 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1896 } 1897 1898 // Vector reduction multiply for floating-point type with ASIMD instructions. 1899 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1900 FloatRegister fsrc, FloatRegister vsrc, 1901 unsigned vector_length_in_bytes, 1902 FloatRegister vtmp) { 1903 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1904 bool isQ = vector_length_in_bytes == 16; 1905 1906 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1907 switch(bt) { 1908 // The T_SHORT type below is for Float16 type which also uses floating-point 1909 // instructions. 1910 case T_SHORT: 1911 fmulh(dst, fsrc, vsrc); 1912 ext(vtmp, T8B, vsrc, vsrc, 2); 1913 fmulh(dst, dst, vtmp); 1914 ext(vtmp, T8B, vsrc, vsrc, 4); 1915 fmulh(dst, dst, vtmp); 1916 ext(vtmp, T8B, vsrc, vsrc, 6); 1917 fmulh(dst, dst, vtmp); 1918 if (isQ) { 1919 ext(vtmp, T16B, vsrc, vsrc, 8); 1920 fmulh(dst, dst, vtmp); 1921 ext(vtmp, T16B, vsrc, vsrc, 10); 1922 fmulh(dst, dst, vtmp); 1923 ext(vtmp, T16B, vsrc, vsrc, 12); 1924 fmulh(dst, dst, vtmp); 1925 ext(vtmp, T16B, vsrc, vsrc, 14); 1926 fmulh(dst, dst, vtmp); 1927 } 1928 break; 1929 case T_FLOAT: 1930 fmuls(dst, fsrc, vsrc); 1931 ins(vtmp, S, vsrc, 0, 1); 1932 fmuls(dst, dst, vtmp); 1933 if (isQ) { 1934 ins(vtmp, S, vsrc, 0, 2); 1935 fmuls(dst, dst, vtmp); 1936 ins(vtmp, S, vsrc, 0, 3); 1937 fmuls(dst, dst, vtmp); 1938 } 1939 break; 1940 case T_DOUBLE: 1941 assert(isQ, "unsupported"); 1942 fmuld(dst, fsrc, vsrc); 1943 ins(vtmp, D, vsrc, 0, 1); 1944 fmuld(dst, dst, vtmp); 1945 break; 1946 default: 1947 assert(false, "unsupported"); 1948 ShouldNotReachHere(); 1949 } 1950 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1951 } 1952 1953 // Vector reduction add for half float type with ASIMD instructions. 1954 void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc, 1955 unsigned vector_length_in_bytes, FloatRegister vtmp) { 1956 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1957 bool isQ = vector_length_in_bytes == 16; 1958 1959 BLOCK_COMMENT("neon_reduce_add_fp16 {"); 1960 faddh(dst, fsrc, vsrc); 1961 ext(vtmp, T8B, vsrc, vsrc, 2); 1962 faddh(dst, dst, vtmp); 1963 ext(vtmp, T8B, vsrc, vsrc, 4); 1964 faddh(dst, dst, vtmp); 1965 ext(vtmp, T8B, vsrc, vsrc, 6); 1966 faddh(dst, dst, vtmp); 1967 if (isQ) { 1968 ext(vtmp, T16B, vsrc, vsrc, 8); 1969 faddh(dst, dst, vtmp); 1970 ext(vtmp, T16B, vsrc, vsrc, 10); 1971 faddh(dst, dst, vtmp); 1972 ext(vtmp, T16B, vsrc, vsrc, 12); 1973 faddh(dst, dst, vtmp); 1974 ext(vtmp, T16B, vsrc, vsrc, 14); 1975 faddh(dst, dst, vtmp); 1976 } 1977 BLOCK_COMMENT("} neon_reduce_add_fp16"); 1978 } 1979 1980 // Helper to select logical instruction 1981 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1982 Register Rn, Register Rm, 1983 enum shift_kind kind, unsigned shift) { 1984 switch(opc) { 1985 case Op_AndReductionV: 1986 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1987 break; 1988 case Op_OrReductionV: 1989 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1990 break; 1991 case Op_XorReductionV: 1992 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1993 break; 1994 default: 1995 assert(false, "unsupported"); 1996 ShouldNotReachHere(); 1997 } 1998 } 1999 2000 // Vector reduction logical operations And, Or, Xor 2001 // Clobbers: rscratch1 2002 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2003 Register isrc, FloatRegister vsrc, 2004 unsigned vector_length_in_bytes) { 2005 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2006 "unsupported"); 2007 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2008 assert_different_registers(dst, isrc); 2009 bool isQ = vector_length_in_bytes == 16; 2010 2011 BLOCK_COMMENT("neon_reduce_logical {"); 2012 umov(rscratch1, vsrc, isQ ? D : S, 0); 2013 umov(dst, vsrc, isQ ? D : S, 1); 2014 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2015 switch(bt) { 2016 case T_BYTE: 2017 if (isQ) { 2018 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2019 } 2020 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2021 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2022 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2023 sxtb(dst, dst); 2024 break; 2025 case T_SHORT: 2026 if (isQ) { 2027 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2028 } 2029 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2030 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2031 sxth(dst, dst); 2032 break; 2033 case T_INT: 2034 if (isQ) { 2035 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2036 } 2037 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2038 break; 2039 case T_LONG: 2040 assert(isQ, "unsupported"); 2041 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2042 break; 2043 default: 2044 assert(false, "unsupported"); 2045 ShouldNotReachHere(); 2046 } 2047 BLOCK_COMMENT("} neon_reduce_logical"); 2048 } 2049 2050 // Helper function to decode min/max reduction operation properties 2051 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min, 2052 bool* is_unsigned, 2053 Condition* cond) { 2054 switch(opc) { 2055 case Op_MinReductionV: 2056 *is_min = true; *is_unsigned = false; *cond = LT; break; 2057 case Op_MaxReductionV: 2058 *is_min = false; *is_unsigned = false; *cond = GT; break; 2059 case Op_UMinReductionV: 2060 *is_min = true; *is_unsigned = true; *cond = LO; break; 2061 case Op_UMaxReductionV: 2062 *is_min = false; *is_unsigned = true; *cond = HI; break; 2063 default: 2064 ShouldNotReachHere(); 2065 } 2066 } 2067 2068 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions. 2069 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2070 // Clobbers: rscratch1, rflags 2071 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2072 Register isrc, FloatRegister vsrc, 2073 unsigned vector_length_in_bytes, 2074 FloatRegister vtmp) { 2075 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV || 2076 opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported"); 2077 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2078 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2079 assert_different_registers(dst, isrc); 2080 bool isQ = vector_length_in_bytes == 16; 2081 bool is_min; 2082 bool is_unsigned; 2083 Condition cond; 2084 decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond); 2085 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2086 if (bt == T_LONG) { 2087 assert(vtmp == fnoreg, "should be"); 2088 assert(isQ, "should be"); 2089 umov(rscratch1, vsrc, D, 0); 2090 cmp(isrc, rscratch1); 2091 csel(dst, isrc, rscratch1, cond); 2092 umov(rscratch1, vsrc, D, 1); 2093 cmp(dst, rscratch1); 2094 csel(dst, dst, rscratch1, cond); 2095 } else { 2096 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2097 if (size == T2S) { 2098 // For T2S (2x32-bit elements), use pairwise instructions because 2099 // uminv/umaxv/sminv/smaxv don't support arrangement 2S. 2100 neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc); 2101 } else { 2102 // For other sizes, use reduction to scalar instructions. 2103 neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc); 2104 } 2105 if (bt == T_INT) { 2106 umov(dst, vtmp, S, 0); 2107 } else if (is_unsigned) { 2108 umov(dst, vtmp, elemType_to_regVariant(bt), 0); 2109 } else { 2110 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2111 } 2112 cmpw(dst, isrc); 2113 cselw(dst, dst, isrc, cond); 2114 } 2115 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2116 } 2117 2118 // Vector reduction for integral type with SVE instruction. 2119 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin. 2120 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 2121 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2122 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2123 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2124 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2125 assert_different_registers(src1, dst); 2126 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2127 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2128 switch (opc) { 2129 case Op_AddReductionVI: { 2130 sve_uaddv(tmp, size, pg, src2); 2131 if (bt == T_BYTE) { 2132 smov(dst, tmp, size, 0); 2133 addw(dst, src1, dst, ext::sxtb); 2134 } else if (bt == T_SHORT) { 2135 smov(dst, tmp, size, 0); 2136 addw(dst, src1, dst, ext::sxth); 2137 } else { 2138 umov(dst, tmp, size, 0); 2139 addw(dst, dst, src1); 2140 } 2141 break; 2142 } 2143 case Op_AddReductionVL: { 2144 sve_uaddv(tmp, size, pg, src2); 2145 umov(dst, tmp, size, 0); 2146 add(dst, dst, src1); 2147 break; 2148 } 2149 case Op_AndReductionV: { 2150 sve_andv(tmp, size, pg, src2); 2151 if (bt == T_INT || bt == T_LONG) { 2152 umov(dst, tmp, size, 0); 2153 } else { 2154 smov(dst, tmp, size, 0); 2155 } 2156 if (bt == T_LONG) { 2157 andr(dst, dst, src1); 2158 } else { 2159 andw(dst, dst, src1); 2160 } 2161 break; 2162 } 2163 case Op_OrReductionV: { 2164 sve_orv(tmp, size, pg, src2); 2165 if (bt == T_INT || bt == T_LONG) { 2166 umov(dst, tmp, size, 0); 2167 } else { 2168 smov(dst, tmp, size, 0); 2169 } 2170 if (bt == T_LONG) { 2171 orr(dst, dst, src1); 2172 } else { 2173 orrw(dst, dst, src1); 2174 } 2175 break; 2176 } 2177 case Op_XorReductionV: { 2178 sve_eorv(tmp, size, pg, src2); 2179 if (bt == T_INT || bt == T_LONG) { 2180 umov(dst, tmp, size, 0); 2181 } else { 2182 smov(dst, tmp, size, 0); 2183 } 2184 if (bt == T_LONG) { 2185 eor(dst, dst, src1); 2186 } else { 2187 eorw(dst, dst, src1); 2188 } 2189 break; 2190 } 2191 case Op_MaxReductionV: 2192 case Op_MinReductionV: 2193 case Op_UMaxReductionV: 2194 case Op_UMinReductionV: { 2195 bool is_min; 2196 bool is_unsigned; 2197 Condition cond; 2198 decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond); 2199 sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2); 2200 // Move result from vector to general register 2201 if (is_unsigned || bt == T_INT || bt == T_LONG) { 2202 umov(dst, tmp, size, 0); 2203 } else { 2204 smov(dst, tmp, size, 0); 2205 } 2206 if (bt == T_LONG) { 2207 cmp(dst, src1); 2208 csel(dst, dst, src1, cond); 2209 } else { 2210 cmpw(dst, src1); 2211 cselw(dst, dst, src1, cond); 2212 } 2213 break; 2214 } 2215 default: 2216 assert(false, "unsupported"); 2217 ShouldNotReachHere(); 2218 } 2219 2220 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2221 if (bt == T_BYTE) { 2222 sxtb(dst, dst); 2223 } else if (bt == T_SHORT) { 2224 sxth(dst, dst); 2225 } 2226 } 2227 } 2228 2229 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2230 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2231 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 2232 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2233 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2234 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2235 2236 // Set all elements to false if the input "lane_cnt" is zero. 2237 if (lane_cnt == 0) { 2238 sve_pfalse(dst); 2239 return; 2240 } 2241 2242 SIMD_RegVariant size = elemType_to_regVariant(bt); 2243 assert(size != Q, "invalid size"); 2244 2245 // Set all true if "lane_cnt" equals to the max lane count. 2246 if (lane_cnt == max_vector_length) { 2247 sve_ptrue(dst, size, /* ALL */ 0b11111); 2248 return; 2249 } 2250 2251 // Fixed numbers for "ptrue". 2252 switch(lane_cnt) { 2253 case 1: /* VL1 */ 2254 case 2: /* VL2 */ 2255 case 3: /* VL3 */ 2256 case 4: /* VL4 */ 2257 case 5: /* VL5 */ 2258 case 6: /* VL6 */ 2259 case 7: /* VL7 */ 2260 case 8: /* VL8 */ 2261 sve_ptrue(dst, size, lane_cnt); 2262 return; 2263 case 16: 2264 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2265 return; 2266 case 32: 2267 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2268 return; 2269 case 64: 2270 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2271 return; 2272 case 128: 2273 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2274 return; 2275 case 256: 2276 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2277 return; 2278 default: 2279 break; 2280 } 2281 2282 // Special patterns for "ptrue". 2283 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2284 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2285 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2286 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2287 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2288 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2289 } else { 2290 // Encode to "whileltw" for the remaining cases. 2291 mov(rscratch1, lane_cnt); 2292 sve_whileltw(dst, size, zr, rscratch1); 2293 } 2294 } 2295 2296 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2297 // Any remaining elements of dst will be filled with zero. 2298 // Clobbers: rscratch1 2299 // Preserves: mask, vzr 2300 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2301 FloatRegister vzr, FloatRegister vtmp, 2302 PRegister pgtmp, unsigned vector_length_in_bytes) { 2303 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2304 // When called by sve_compress_byte, src and vtmp may be the same register. 2305 assert_different_registers(dst, src, vzr); 2306 assert_different_registers(dst, vtmp, vzr); 2307 assert_different_registers(mask, pgtmp); 2308 // high <-- low 2309 // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits. 2310 // mask = 01 00 00 01 01 00 01 01, one character is 1 bit. 2311 // Expected result: dst = 00 00 00 hh ee dd bb aa 2312 2313 // Extend lowest half to type INT. 2314 // dst = 00dd 00cc 00bb 00aa 2315 sve_uunpklo(dst, S, src); 2316 // pgtmp = 0001 0000 0001 0001 2317 sve_punpklo(pgtmp, mask); 2318 // Pack the active elements in size of type INT to the right, 2319 // and fill the remainings with zero. 2320 // dst = 0000 00dd 00bb 00aa 2321 sve_compact(dst, S, dst, pgtmp); 2322 // Narrow the result back to type SHORT. 2323 // dst = 00 00 00 00 00 dd bb aa 2324 sve_uzp1(dst, H, dst, vzr); 2325 2326 // Return if the vector length is no more than MaxVectorSize/2, since the 2327 // highest half is invalid. 2328 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) { 2329 return; 2330 } 2331 2332 // Count the active elements of lowest half. 2333 // rscratch1 = 3 2334 sve_cntp(rscratch1, S, ptrue, pgtmp); 2335 2336 // Repeat to the highest half. 2337 // pgtmp = 0001 0000 0000 0001 2338 sve_punpkhi(pgtmp, mask); 2339 // vtmp = 00hh 00gg 00ff 00ee 2340 sve_uunpkhi(vtmp, S, src); 2341 // vtmp = 0000 0000 00hh 00ee 2342 sve_compact(vtmp, S, vtmp, pgtmp); 2343 // vtmp = 00 00 00 00 00 00 hh ee 2344 sve_uzp1(vtmp, H, vtmp, vzr); 2345 2346 // pgtmp = 00 00 00 00 00 01 01 01 2347 sve_whilelt(pgtmp, H, zr, rscratch1); 2348 // Compressed low: dst = 00 00 00 00 00 dd bb aa 2349 // Compressed high: vtmp = 00 00 00 00 00 00 hh ee 2350 // Combine the compressed low with the compressed high: 2351 // dst = 00 00 00 hh ee dd bb aa 2352 sve_splice(dst, H, pgtmp, vtmp); 2353 } 2354 2355 // Clobbers: rscratch1, rscratch2 2356 // Preserves: src, mask 2357 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, 2358 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, 2359 PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) { 2360 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2361 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3); 2362 assert_different_registers(mask, ptmp, pgtmp); 2363 // high <-- low 2364 // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits. 2365 // mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit. 2366 // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a 2367 FloatRegister vzr = vtmp3; 2368 sve_dup(vzr, B, 0); 2369 2370 // Extend lowest half to type SHORT. 2371 // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a 2372 sve_uunpklo(vtmp1, H, src); 2373 // ptmp = 00 01 00 00 00 01 00 01 2374 sve_punpklo(ptmp, mask); 2375 // Pack the active elements in size of type SHORT to the right, 2376 // and fill the remainings with zero. 2377 // dst = 00 00 00 00 00 0g 0c 0a 2378 unsigned extended_size = vector_length_in_bytes << 1; 2379 sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size); 2380 // Narrow the result back to type BYTE. 2381 // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a 2382 sve_uzp1(dst, B, dst, vzr); 2383 2384 // Return if the vector length is no more than MaxVectorSize/2, since the 2385 // highest half is invalid. 2386 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) { 2387 return; 2388 } 2389 // Count the active elements of lowest half. 2390 // rscratch2 = 3 2391 sve_cntp(rscratch2, H, ptrue, ptmp); 2392 2393 // Repeat to the highest half. 2394 // ptmp = 00 01 00 00 00 00 00 01 2395 sve_punpkhi(ptmp, mask); 2396 // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i 2397 sve_uunpkhi(vtmp2, H, src); 2398 // vtmp1 = 00 00 00 00 00 00 0p 0i 2399 sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize); 2400 // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i 2401 sve_uzp1(vtmp1, B, vtmp1, vzr); 2402 2403 // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2404 sve_whilelt(ptmp, B, zr, rscratch2); 2405 // Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a 2406 // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i 2407 // Combine the compressed low with the compressed high: 2408 // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a 2409 sve_splice(dst, B, ptmp, vtmp1); 2410 } 2411 2412 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2413 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2414 SIMD_Arrangement size = isQ ? T16B : T8B; 2415 if (bt == T_BYTE) { 2416 rbit(dst, size, src); 2417 } else { 2418 neon_reverse_bytes(dst, src, bt, isQ); 2419 rbit(dst, size, dst); 2420 } 2421 } 2422 2423 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2424 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2425 SIMD_Arrangement size = isQ ? T16B : T8B; 2426 switch (bt) { 2427 case T_BYTE: 2428 if (dst != src) { 2429 orr(dst, size, src, src); 2430 } 2431 break; 2432 case T_SHORT: 2433 rev16(dst, size, src); 2434 break; 2435 case T_INT: 2436 rev32(dst, size, src); 2437 break; 2438 case T_LONG: 2439 rev64(dst, size, src); 2440 break; 2441 default: 2442 assert(false, "unsupported"); 2443 ShouldNotReachHere(); 2444 } 2445 } 2446 2447 // VectorRearrange implementation for short/int/float/long/double types with NEON 2448 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction. 2449 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group. 2450 // For VectorRearrange long/double, we compare the shuffle input with iota indices, 2451 // and use bsl to implement the operation. 2452 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src, 2453 FloatRegister shuffle, FloatRegister tmp, 2454 BasicType bt, bool isQ) { 2455 assert_different_registers(dst, src, shuffle, tmp); 2456 SIMD_Arrangement size1 = isQ ? T16B : T8B; 2457 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); 2458 2459 // Here is an example that rearranges a NEON vector with 4 ints: 2460 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] 2461 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1]. 2462 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector 2463 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get 2464 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. 2465 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100], 2466 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] 2467 // 4. Use Vm as index register, and use V1 as table register. 2468 // Then get V2 as the result by tbl NEON instructions. 2469 switch (bt) { 2470 case T_SHORT: 2471 mov(tmp, size1, 0x02); 2472 mulv(dst, size2, shuffle, tmp); 2473 mov(tmp, size2, 0x0100); 2474 addv(dst, size1, dst, tmp); 2475 tbl(dst, size1, src, 1, dst); 2476 break; 2477 case T_INT: 2478 case T_FLOAT: 2479 mov(tmp, size1, 0x04); 2480 mulv(dst, size2, shuffle, tmp); 2481 mov(tmp, size2, 0x03020100); 2482 addv(dst, size1, dst, tmp); 2483 tbl(dst, size1, src, 1, dst); 2484 break; 2485 case T_LONG: 2486 case T_DOUBLE: 2487 { 2488 int idx = vector_iota_entry_index(T_LONG); 2489 lea(rscratch1, 2490 ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx))); 2491 ldrq(tmp, rscratch1); 2492 // Check whether the input "shuffle" is the same with iota indices. 2493 // Return "src" if true, otherwise swap the two elements of "src". 2494 cm(EQ, dst, size2, shuffle, tmp); 2495 ext(tmp, size1, src, src, 8); 2496 bsl(dst, size1, src, tmp); 2497 } 2498 break; 2499 default: 2500 assert(false, "unsupported element type"); 2501 ShouldNotReachHere(); 2502 } 2503 } 2504 2505 // Extract a scalar element from an sve vector at position 'idx'. 2506 // The input elements in src are expected to be of integral type. 2507 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2508 int idx, FloatRegister vtmp) { 2509 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2510 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2511 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2512 if (bt == T_INT || bt == T_LONG) { 2513 umov(dst, src, size, idx); 2514 } else { 2515 smov(dst, src, size, idx); 2516 } 2517 } else { 2518 sve_orr(vtmp, src, src); 2519 sve_ext(vtmp, vtmp, idx << size); 2520 if (bt == T_INT || bt == T_LONG) { 2521 umov(dst, vtmp, size, 0); 2522 } else { 2523 smov(dst, vtmp, size, 0); 2524 } 2525 } 2526 } 2527 2528 // java.lang.Math::round intrinsics 2529 2530 // Clobbers: rscratch1, rflags 2531 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2532 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2533 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2534 switch (T) { 2535 case T2S: 2536 case T4S: 2537 fmovs(tmp1, T, 0.5f); 2538 mov(rscratch1, jint_cast(0x1.0p23f)); 2539 break; 2540 case T2D: 2541 fmovd(tmp1, T, 0.5); 2542 mov(rscratch1, julong_cast(0x1.0p52)); 2543 break; 2544 default: 2545 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2546 } 2547 fadd(tmp1, T, tmp1, src); 2548 fcvtms(tmp1, T, tmp1); 2549 // tmp1 = floor(src + 0.5, ties to even) 2550 2551 fcvtas(dst, T, src); 2552 // dst = round(src), ties to away 2553 2554 fneg(tmp3, T, src); 2555 dup(tmp2, T, rscratch1); 2556 cm(HS, tmp3, T, tmp3, tmp2); 2557 // tmp3 is now a set of flags 2558 2559 bif(dst, T16B, tmp1, tmp3); 2560 // result in dst 2561 } 2562 2563 // Clobbers: rscratch1, rflags 2564 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2565 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2566 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2567 assert_different_registers(tmp1, tmp2, src, dst); 2568 2569 switch (T) { 2570 case S: 2571 mov(rscratch1, jint_cast(0x1.0p23f)); 2572 break; 2573 case D: 2574 mov(rscratch1, julong_cast(0x1.0p52)); 2575 break; 2576 default: 2577 assert(T == S || T == D, "invalid register variant"); 2578 } 2579 2580 sve_frinta(dst, T, ptrue, src); 2581 // dst = round(src), ties to away 2582 2583 Label none; 2584 2585 sve_fneg(tmp1, T, ptrue, src); 2586 sve_dup(tmp2, T, rscratch1); 2587 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2588 br(EQ, none); 2589 { 2590 sve_cpy(tmp1, T, pgtmp, 0.5); 2591 sve_fadd(tmp1, T, pgtmp, src); 2592 sve_frintm(dst, T, pgtmp, tmp1); 2593 // dst = floor(src + 0.5, ties to even) 2594 } 2595 bind(none); 2596 2597 sve_fcvtzs(dst, T, ptrue, dst, T); 2598 // result in dst 2599 } 2600 2601 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2602 FloatRegister one, SIMD_Arrangement T) { 2603 assert_different_registers(dst, src, zero, one); 2604 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2605 2606 facgt(dst, T, src, zero); 2607 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise 2608 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst 2609 } 2610 2611 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero, 2612 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) { 2613 assert_different_registers(dst, src, zero, one, vtmp); 2614 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2615 2616 sve_orr(vtmp, src, src); 2617 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise 2618 switch (T) { 2619 case S: 2620 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src 2621 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending 2622 // on the sign of the float value 2623 break; 2624 case D: 2625 sve_and(vtmp, T, min_jlong); 2626 sve_orr(vtmp, T, jlong_cast(1.0)); 2627 break; 2628 default: 2629 assert(false, "unsupported"); 2630 ShouldNotReachHere(); 2631 } 2632 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp 2633 // Result in dst 2634 } 2635 2636 bool C2_MacroAssembler::in_scratch_emit_size() { 2637 if (ciEnv::current()->task() != nullptr) { 2638 PhaseOutput* phase_output = Compile::current()->output(); 2639 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2640 return true; 2641 } 2642 } 2643 return MacroAssembler::in_scratch_emit_size(); 2644 } 2645 2646 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) { 2647 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi); 2648 } 2649 2650 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) { 2651 assert(!t->empty() && !t->singleton(), "%s", Type::str(t)); 2652 if (t == TypeInt::INT) { 2653 return; 2654 } 2655 2656 BLOCK_COMMENT("verify_int_in_range {"); 2657 Label L_success, L_failure; 2658 2659 jint lo = t->_lo; 2660 jint hi = t->_hi; 2661 2662 if (lo != min_jint) { 2663 subsw(rtmp, rval, lo); 2664 br(Assembler::LT, L_failure); 2665 } 2666 if (hi != max_jint) { 2667 subsw(rtmp, rval, hi); 2668 br(Assembler::GT, L_failure); 2669 } 2670 b(L_success); 2671 2672 bind(L_failure); 2673 movw(c_rarg0, idx); 2674 mov(c_rarg1, rval); 2675 movw(c_rarg2, lo); 2676 movw(c_rarg3, hi); 2677 reconstruct_frame_pointer(rtmp); 2678 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp); 2679 hlt(0); 2680 2681 bind(L_success); 2682 BLOCK_COMMENT("} verify_int_in_range"); 2683 } 2684 2685 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) { 2686 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi); 2687 } 2688 2689 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) { 2690 assert(!t->empty() && !t->singleton(), "%s", Type::str(t)); 2691 if (t == TypeLong::LONG) { 2692 return; 2693 } 2694 2695 BLOCK_COMMENT("verify_long_in_range {"); 2696 Label L_success, L_failure; 2697 2698 jlong lo = t->_lo; 2699 jlong hi = t->_hi; 2700 2701 if (lo != min_jlong) { 2702 subs(rtmp, rval, lo); 2703 br(Assembler::LT, L_failure); 2704 } 2705 if (hi != max_jlong) { 2706 subs(rtmp, rval, hi); 2707 br(Assembler::GT, L_failure); 2708 } 2709 b(L_success); 2710 2711 bind(L_failure); 2712 movw(c_rarg0, idx); 2713 mov(c_rarg1, rval); 2714 mov(c_rarg2, lo); 2715 mov(c_rarg3, hi); 2716 reconstruct_frame_pointer(rtmp); 2717 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp); 2718 hlt(0); 2719 2720 bind(L_success); 2721 BLOCK_COMMENT("} verify_long_in_range"); 2722 } 2723 2724 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { 2725 const int framesize = Compile::current()->output()->frame_size_in_bytes(); 2726 if (PreserveFramePointer) { 2727 // frame pointer is valid 2728 #ifdef ASSERT 2729 // Verify frame pointer value in rfp. 2730 add(rtmp, sp, framesize - 2 * wordSize); 2731 Label L_success; 2732 cmp(rfp, rtmp); 2733 br(Assembler::EQ, L_success); 2734 stop("frame pointer mismatch"); 2735 bind(L_success); 2736 #endif // ASSERT 2737 } else { 2738 add(rfp, sp, framesize - 2 * wordSize); 2739 } 2740 } 2741 2742 // Selects elements from two source vectors (src1, src2) based on index values in the index register 2743 // using Neon instructions and places it in the destination vector element corresponding to the 2744 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM), 2745 // where NUM_ELEM is the number of BasicType elements per vector. 2746 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register) 2747 // Otherwise, selects src2[idx – NUM_ELEM] 2748 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1, 2749 FloatRegister src2, FloatRegister index, 2750 FloatRegister tmp, unsigned vector_length_in_bytes) { 2751 assert_different_registers(dst, src1, src2, tmp); 2752 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B; 2753 2754 if (vector_length_in_bytes == 16) { 2755 assert(UseSVE <= 1, "sve must be <= 1"); 2756 assert(src1->successor() == src2, "Source registers must be ordered"); 2757 // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table 2758 tbl(dst, size, src1, 2, index); 2759 } else { // vector length == 8 2760 assert(UseSVE == 0, "must be Neon only"); 2761 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the 2762 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl" 2763 // instruction with one vector lookup 2764 ins(tmp, D, src1, 0, 0); 2765 ins(tmp, D, src2, 1, 0); 2766 tbl(dst, size, tmp, 1, index); 2767 } 2768 } 2769 2770 // Selects elements from two source vectors (src1, src2) based on index values in the index register 2771 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the 2772 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM), 2773 // where NUM_ELEM is the number of BasicType elements per vector. 2774 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register) 2775 // Otherwise, selects src2[idx – NUM_ELEM] 2776 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1, 2777 FloatRegister src2, FloatRegister index, 2778 FloatRegister tmp, SIMD_RegVariant T, 2779 unsigned vector_length_in_bytes) { 2780 assert_different_registers(dst, src1, src2, index, tmp); 2781 2782 if (vector_length_in_bytes == 8) { 2783 // We need to fit both the source vectors (src1, src2) in a single vector register because the 2784 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to 2785 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl" 2786 // instruction with one vector lookup 2787 assert(UseSVE >= 1, "sve must be >= 1"); 2788 ins(tmp, D, src1, 0, 0); 2789 ins(tmp, D, src2, 1, 0); 2790 sve_tbl(dst, T, tmp, index); 2791 } else { // UseSVE == 2 and vector_length_in_bytes > 8 2792 // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table. 2793 // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation 2794 // is not executed on machines where vector_length_in_bytes < MaxVectorSize 2795 // with the only exception of 8B vector length. 2796 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be"); 2797 assert(src1->successor() == src2, "Source registers must be ordered"); 2798 sve_tbl(dst, T, src1, src2, index); 2799 } 2800 } 2801 2802 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1, 2803 FloatRegister src2, FloatRegister index, 2804 FloatRegister tmp, BasicType bt, 2805 unsigned vector_length_in_bytes) { 2806 2807 assert_different_registers(dst, src1, src2, index, tmp); 2808 2809 // The cases that can reach this method are - 2810 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types 2811 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types 2812 // 2813 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8 2814 // and UseSVE = 2 with vector_length_in_bytes >= 8 2815 // 2816 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and 2817 // UseSVE = 1 with vector_length_in_bytes = 16 2818 2819 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) { 2820 SIMD_RegVariant T = elemType_to_regVariant(bt); 2821 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes); 2822 return; 2823 } 2824 2825 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT 2826 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type"); 2827 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16"); 2828 2829 bool isQ = vector_length_in_bytes == 16; 2830 2831 SIMD_Arrangement size1 = isQ ? T16B : T8B; 2832 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ); 2833 2834 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of 2835 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table. 2836 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM 2837 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length, 2838 // the indices can range from [0, 8). 2839 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0] 2840 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202] 2841 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000] 2842 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100] 2843 // Add the multiplied result to the vector in tmp to obtain the byte level 2844 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100] 2845 // Use these offsets in the "tbl" instruction to select chunks of 2B. 2846 2847 if (bt == T_BYTE) { 2848 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes); 2849 } else { 2850 int elem_size = (bt == T_SHORT) ? 2 : 4; 2851 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u; 2852 2853 mov(tmp, size1, elem_size); 2854 mulv(dst, size2, index, tmp); 2855 mov(tmp, size2, tbl_offset); 2856 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements 2857 // to select a set of 2B/4B 2858 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes); 2859 } 2860 } 2861 2862 // Vector expand implementation. Elements from the src vector are expanded into 2863 // the dst vector under the control of the vector mask. 2864 // Since there are no native instructions directly corresponding to expand before 2865 // SVE2p2, the following implementations mainly leverages the TBL instruction to 2866 // implement expand. To compute the index input for TBL, the prefix sum algorithm 2867 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used 2868 // for NEON and SVE, but with different instructions where appropriate. 2869 2870 // Vector expand implementation for NEON. 2871 // 2872 // An example of 128-bit Byte vector: 2873 // Data direction: high <== low 2874 // Input: 2875 // src = g f e d c b a 9 8 7 6 5 4 3 2 1 2876 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 2877 // Expected result: 2878 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1 2879 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask, 2880 FloatRegister tmp1, FloatRegister tmp2, BasicType bt, 2881 int vector_length_in_bytes) { 2882 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16"); 2883 assert_different_registers(dst, src, mask, tmp1, tmp2); 2884 // Since the TBL instruction only supports byte table, we need to 2885 // compute indices in byte type for all types. 2886 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B; 2887 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2888 dup(tmp1, size, zr); 2889 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 2890 negr(dst, size, mask); 2891 // Calculate vector index for TBL with prefix sum algorithm. 2892 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1 2893 for (int i = 1; i < vector_length_in_bytes; i <<= 1) { 2894 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i); 2895 addv(dst, size, tmp2, dst); 2896 } 2897 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 2898 orr(tmp2, size, mask, mask); 2899 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1 2900 bsl(tmp2, size, dst, tmp1); 2901 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2902 movi(tmp1, size, 1); 2903 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0 2904 subv(dst, size, tmp2, tmp1); 2905 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1 2906 tbl(dst, size, src, 1, dst); 2907 } 2908 2909 // Vector expand implementation for SVE. 2910 // 2911 // An example of 128-bit Short vector: 2912 // Data direction: high <== low 2913 // Input: 2914 // src = gf ed cb a9 87 65 43 21 2915 // pg = 00 01 00 01 00 01 00 01 2916 // Expected result: 2917 // dst = 00 87 00 65 00 43 00 21 2918 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg, 2919 FloatRegister tmp1, FloatRegister tmp2, BasicType bt, 2920 int vector_length_in_bytes) { 2921 assert(UseSVE > 0, "expand implementation only for SVE"); 2922 assert_different_registers(dst, src, tmp1, tmp2); 2923 SIMD_RegVariant size = elemType_to_regVariant(bt); 2924 2925 // tmp1 = 00 00 00 00 00 00 00 00 2926 sve_dup(tmp1, size, 0); 2927 sve_movprfx(tmp2, tmp1); 2928 // tmp2 = 00 01 00 01 00 01 00 01 2929 sve_cpy(tmp2, size, pg, 1, true); 2930 // Calculate vector index for TBL with prefix sum algorithm. 2931 // tmp2 = 04 04 03 03 02 02 01 01 2932 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) { 2933 sve_movprfx(dst, tmp1); 2934 // The EXT instruction operates on the full-width sve register. The correct 2935 // index calculation method is: 2936 // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes => 2937 // MaxVectorSize - i. 2938 sve_ext(dst, tmp2, MaxVectorSize - i); 2939 sve_add(tmp2, size, dst, tmp2); 2940 } 2941 // dst = 00 04 00 03 00 02 00 01 2942 sve_sel(dst, size, pg, tmp2, tmp1); 2943 // dst = -1 03 -1 02 -1 01 -1 00 2944 sve_sub(dst, size, 1); 2945 // dst = 00 87 00 65 00 43 00 21 2946 sve_tbl(dst, size, src, dst); 2947 } 2948 2949 // Optimized SVE cpy (imm, zeroing) instruction. 2950 // 2951 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same 2952 // functionality, but test results show that `movi; cpy(imm, merging)` has 2953 // higher throughput on some microarchitectures. This would depend on 2954 // microarchitecture and so may vary between implementations. 2955 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T, 2956 PRegister pg, int imm8, bool isMerge) { 2957 if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) { 2958 // Generates a NEON instruction `movi V<dst>.2d, #0`. 2959 // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is 2960 // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of 2961 // Z<dst> above 128, so this `movi` instruction effectively zeroes the 2962 // entire Z<dst> register. According to the Arm Software Optimization 2963 // Guide, `movi` is zero latency. 2964 movi(dst, T2D, 0); 2965 isMerge = true; 2966 } 2967 Assembler::sve_cpy(dst, T, pg, imm8, isMerge); 2968 } 2969 2970 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) { 2971 // The vector iota entries array is ordered by type B/S/I/L/F/D, and 2972 // the offset between two types is 16. 2973 switch(bt) { 2974 case T_BYTE: 2975 return 0; 2976 case T_SHORT: 2977 return 1; 2978 case T_INT: 2979 return 2; 2980 case T_LONG: 2981 return 3; 2982 case T_FLOAT: 2983 return 4; 2984 case T_DOUBLE: 2985 return 5; 2986 default: 2987 ShouldNotReachHere(); 2988 } 2989 } --- EOF ---