1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 #include "utilities/globalDefinitions.hpp" 36 37 #ifdef PRODUCT 38 #define BLOCK_COMMENT(str) /* nothing */ 39 #define STOP(error) stop(error) 40 #else 41 #define BLOCK_COMMENT(str) block_comment(str) 42 #define STOP(error) block_comment(error); stop(error) 43 #endif 44 45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 46 47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 48 49 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 50 Register tmp2Reg, Register tmp3Reg) { 51 Register oop = objectReg; 52 Register box = boxReg; 53 Register disp_hdr = tmpReg; 54 Register tmp = tmp2Reg; 55 Label cont; 56 Label object_has_monitor; 57 Label count, no_count; 58 59 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 60 assert_different_registers(oop, box, tmp, disp_hdr); 61 62 // Load markWord from object into displaced_header. 63 ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 64 65 if (DiagnoseSyncOnValueBasedClasses != 0) { 66 load_klass(tmp, oop); 67 ldrw(tmp, Address(tmp, Klass::access_flags_offset())); 68 tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS); 69 br(Assembler::NE, cont); 70 } 71 72 // Check for existing monitor 73 tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor); 74 75 if (LockingMode == LM_MONITOR) { 76 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 77 b(cont); 78 } else { 79 assert(LockingMode == LM_LEGACY, "must be"); 80 // Set tmp to be (markWord of object | UNLOCK_VALUE). 81 orr(tmp, disp_hdr, markWord::unlocked_value); 82 83 // Initialize the box. (Must happen before we update the object mark!) 84 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 85 86 // Compare object markWord with an unlocked value (tmp) and if 87 // equal exchange the stack address of our box with object markWord. 88 // On failure disp_hdr contains the possibly locked markWord. 
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition is true we already own the lock and hence can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
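  // (For reference: the low two bits of the object's mark word encode the lock
  // state: 0b01 unlocked, 0b00 stack-locked, 0b10 inflated monitor. The tbnz
  // below tests the monitor bit.)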
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrw(t1, Address(t1, Klass::access_flags_offset()));
    tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;
    const Register t3_t = t3;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock.
Transition lock-bits 0b01 => 0b00 275 orr(t1_mark, t1_mark, markWord::unlocked_value); 276 eor(t3_t, t1_mark, markWord::unlocked_value); 277 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 278 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 279 br(Assembler::NE, slow_path); 280 281 bind(push); 282 // After successful lock, push object on lock-stack. 283 str(obj, Address(rthread, t2_top)); 284 addw(t2_top, t2_top, oopSize); 285 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 286 b(locked); 287 } 288 289 { // Handle inflated monitor. 290 bind(inflated); 291 292 // mark contains the tagged ObjectMonitor*. 293 const Register t1_tagged_monitor = t1_mark; 294 const uintptr_t monitor_tag = markWord::monitor_value; 295 const Register t2_owner_addr = t2; 296 const Register t3_owner = t3; 297 298 // Compute owner address. 299 lea(t2_owner_addr, Address(t1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag))); 300 301 // CAS owner (null => current thread). 302 cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true, 303 /*release*/ false, /*weak*/ false, t3_owner); 304 br(Assembler::EQ, locked); 305 306 // Check if recursive. 307 cmp(t3_owner, rthread); 308 br(Assembler::NE, slow_path); 309 310 // Recursive. 311 increment(Address(t1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1); 312 } 313 314 bind(locked); 315 increment(Address(rthread, JavaThread::held_monitor_count_offset())); 316 317 #ifdef ASSERT 318 // Check that locked label is reached with Flags == EQ. 319 Label flag_correct; 320 br(Assembler::EQ, flag_correct); 321 stop("Fast Lock Flag != EQ"); 322 #endif 323 324 bind(slow_path); 325 #ifdef ASSERT 326 // Check that slow_path label is reached with Flags == NE. 327 br(Assembler::NE, flag_correct); 328 stop("Fast Lock Flag != NE"); 329 bind(flag_correct); 330 #endif 331 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 332 } 333 334 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2, 335 Register t3) { 336 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 337 assert_different_registers(obj, t1, t2, t3); 338 339 // Handle inflated monitor. 340 Label inflated, inflated_load_monitor; 341 // Finish fast unlock successfully. MUST branch to with flag == EQ 342 Label unlocked; 343 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 344 Label slow_path; 345 346 const Register t1_mark = t1; 347 const Register t2_top = t2; 348 const Register t3_t = t3; 349 350 { // Lightweight unlock 351 352 // Check if obj is top of lock-stack. 353 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 354 subw(t2_top, t2_top, oopSize); 355 ldr(t3_t, Address(rthread, t2_top)); 356 cmp(obj, t3_t); 357 // Top of lock stack was not obj. Must be monitor. 358 br(Assembler::NE, inflated_load_monitor); 359 360 // Pop lock-stack. 361 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 362 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 363 364 // Check if recursive. 365 subw(t3_t, t2_top, oopSize); 366 ldr(t3_t, Address(rthread, t3_t)); 367 cmp(obj, t3_t); 368 br(Assembler::EQ, unlocked); 369 370 // Not recursive. 371 // Load Mark. 372 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 373 374 // Check header for monitor (0b10). 375 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 376 377 // Try to unlock. 
Transition lock bits 0b00 => 0b01 378 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 379 orr(t3_t, t1_mark, markWord::unlocked_value); 380 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 381 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 382 br(Assembler::EQ, unlocked); 383 384 // Compare and exchange failed. 385 // Restore lock-stack and handle the unlock in runtime. 386 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 387 addw(t2_top, t2_top, oopSize); 388 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 389 b(slow_path); 390 } 391 392 393 { // Handle inflated monitor. 394 bind(inflated_load_monitor); 395 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 396 #ifdef ASSERT 397 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 398 stop("Fast Unlock not monitor"); 399 #endif 400 401 bind(inflated); 402 403 #ifdef ASSERT 404 Label check_done; 405 subw(t2_top, t2_top, oopSize); 406 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 407 br(Assembler::LT, check_done); 408 ldr(t3_t, Address(rthread, t2_top)); 409 cmp(obj, t3_t); 410 br(Assembler::NE, inflated); 411 stop("Fast Unlock lock on stack"); 412 bind(check_done); 413 #endif 414 415 // mark contains the tagged ObjectMonitor*. 416 const Register t1_monitor = t1_mark; 417 const uintptr_t monitor_tag = markWord::monitor_value; 418 419 // Untag the monitor. 420 sub(t1_monitor, t1_mark, monitor_tag); 421 422 const Register t2_recursions = t2; 423 Label not_recursive; 424 425 // Check if recursive. 426 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 427 cbz(t2_recursions, not_recursive); 428 429 // Recursive unlock. 430 sub(t2_recursions, t2_recursions, 1u); 431 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 432 // Set flag == EQ 433 cmp(t2_recursions, t2_recursions); 434 b(unlocked); 435 436 bind(not_recursive); 437 438 Label release; 439 const Register t2_owner_addr = t2; 440 441 // Compute owner address. 442 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 443 444 // Check if the entry lists are empty. 445 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset())); 446 ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset())); 447 orr(rscratch1, rscratch1, t3_t); 448 cmp(rscratch1, zr); 449 br(Assembler::EQ, release); 450 451 // The owner may be anonymous and we removed the last obj entry in 452 // the lock-stack. This loses the information about the owner. 453 // Write the thread to the owner field so the runtime knows the owner. 454 str(rthread, Address(t2_owner_addr)); 455 b(slow_path); 456 457 bind(release); 458 // Set owner to null. 459 // Release to satisfy the JMM 460 stlr(zr, t2_owner_addr); 461 } 462 463 bind(unlocked); 464 decrement(Address(rthread, JavaThread::held_monitor_count_offset())); 465 466 #ifdef ASSERT 467 // Check that unlocked label is reached with Flags == EQ. 468 Label flag_correct; 469 br(Assembler::EQ, flag_correct); 470 stop("Fast Unlock Flag != EQ"); 471 #endif 472 473 bind(slow_path); 474 #ifdef ASSERT 475 // Check that slow_path label is reached with Flags == NE. 476 br(Assembler::NE, flag_correct); 477 stop("Fast Unlock Flag != NE"); 478 bind(flag_correct); 479 #endif 480 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 481 } 482 483 // Search for str1 in str2 and return index or -1 484 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 
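// A rough scalar reference for what this intrinsic computes (an illustration
// only, not the code generated below), ignoring the Latin1/UTF-16 encoding
// combinations; the empty-pattern and pattern-longer-than-source cases are
// filtered out by the caller:
//
//   int indexof(const jchar* str2, int cnt2, const jchar* str1, int cnt1) {
//     for (int j = 0; j <= cnt2 - cnt1; j++) {
//       int i = 0;
//       while (i < cnt1 && str1[i] == str2[j + i]) i++;
//       if (i == cnt1) return j;
//     }
//     return -1;
//   }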
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of the pattern in the source or return -1.

  // For a larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use a linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
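  // As a small worked illustration (not part of the original description): for
  // the Latin1 pattern "abcab" (m = 5), the bad-character table built below maps
  // 'a' -> 1, 'b' -> 3, 'c' -> 2 and every other byte -> 5, i.e. how far the
  // pattern may safely be shifted when that byte appears in the source aligned
  // with the last pattern position.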
556 // 557 // #define ASIZE 256 558 // 559 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 560 // int i, j; 561 // unsigned c; 562 // unsigned char bc[ASIZE]; 563 // 564 // /* Preprocessing */ 565 // for (i = 0; i < ASIZE; ++i) 566 // bc[i] = m; 567 // for (i = 0; i < m - 1; ) { 568 // c = x[i]; 569 // ++i; 570 // // c < 256 for Latin1 string, so, no need for branch 571 // #ifdef PATTERN_STRING_IS_LATIN1 572 // bc[c] = m - i; 573 // #else 574 // if (c < ASIZE) bc[c] = m - i; 575 // #endif 576 // } 577 // 578 // /* Searching */ 579 // j = 0; 580 // while (j <= n - m) { 581 // c = y[i+j]; 582 // if (x[m-1] == c) 583 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 584 // if (i < 0) return j; 585 // // c < 256 for Latin1 string, so, no need for branch 586 // #ifdef SOURCE_STRING_IS_LATIN1 587 // // LL case: (c< 256) always true. Remove branch 588 // j += bc[y[j+m-1]]; 589 // #endif 590 // #ifndef PATTERN_STRING_IS_UTF 591 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 592 // if (c < ASIZE) 593 // j += bc[y[j+m-1]]; 594 // else 595 // j += 1 596 // #endif 597 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 598 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 599 // if (c < ASIZE) 600 // j += bc[y[j+m-1]]; 601 // else 602 // j += m 603 // #endif 604 // } 605 // } 606 607 if (icnt1 == -1) { 608 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 609 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 610 Register cnt1end = tmp2; 611 Register str2end = cnt2; 612 Register skipch = tmp2; 613 614 // str1 length is >=8, so, we can read at least 1 register for cases when 615 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 616 // UL case. We'll re-read last character in inner pre-loop code to have 617 // single outer pre-loop load 618 const int firstStep = isL ? 7 : 3; 619 620 const int ASIZE = 256; 621 const int STORED_BYTES = 32; // amount of bytes stored per instruction 622 sub(sp, sp, ASIZE); 623 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 624 mov(ch1, sp); 625 BIND(BM_INIT_LOOP); 626 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 627 subs(tmp5, tmp5, 1); 628 br(GT, BM_INIT_LOOP); 629 630 sub(cnt1tmp, cnt1, 1); 631 mov(tmp5, str2); 632 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 633 sub(ch2, cnt1, 1); 634 mov(tmp3, str1); 635 BIND(BCLOOP); 636 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 637 if (!str1_isL) { 638 subs(zr, ch1, ASIZE); 639 br(HS, BCSKIP); 640 } 641 strb(ch2, Address(sp, ch1)); 642 BIND(BCSKIP); 643 subs(ch2, ch2, 1); 644 br(GT, BCLOOP); 645 646 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 647 if (str1_isL == str2_isL) { 648 // load last 8 bytes (8LL/4UU symbols) 649 ldr(tmp6, Address(tmp6, -wordSize)); 650 } else { 651 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 652 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 653 // it's still faster than per-character loads+checks 654 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 655 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 656 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 657 andr(tmp6, tmp6, 0xFF); // str1[N-4] 658 orr(ch2, ch1, ch2, LSL, 16); 659 orr(tmp6, tmp6, tmp3, LSL, 48); 660 orr(tmp6, tmp6, ch2, LSL, 16); 661 } 662 BIND(BMLOOPSTR2); 663 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 664 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 665 if (str1_isL == str2_isL) { 666 // re-init tmp3. It's for free because it's executed in parallel with 667 // load above. Alternative is to initialize it before loop, but it'll 668 // affect performance on in-order systems with 2 or more ld/st pipelines 669 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 670 } 671 if (!isL) { // UU/UL case 672 lsl(ch2, cnt1tmp, 1); // offset in bytes 673 } 674 cmp(tmp3, skipch); 675 br(NE, BMSKIP); 676 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 677 mov(ch1, tmp6); 678 if (isL) { 679 b(BMLOOPSTR1_AFTER_LOAD); 680 } else { 681 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 682 b(BMLOOPSTR1_CMP); 683 } 684 BIND(BMLOOPSTR1); 685 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 686 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 687 BIND(BMLOOPSTR1_AFTER_LOAD); 688 subs(cnt1tmp, cnt1tmp, 1); 689 br(LT, BMLOOPSTR1_LASTCMP); 690 BIND(BMLOOPSTR1_CMP); 691 cmp(ch1, ch2); 692 br(EQ, BMLOOPSTR1); 693 BIND(BMSKIP); 694 if (!isL) { 695 // if we've met UTF symbol while searching Latin1 pattern, then we can 696 // skip cnt1 symbols 697 if (str1_isL != str2_isL) { 698 mov(result_tmp, cnt1); 699 } else { 700 mov(result_tmp, 1); 701 } 702 subs(zr, skipch, ASIZE); 703 br(HS, BMADV); 704 } 705 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 706 BIND(BMADV); 707 sub(cnt1tmp, cnt1, 1); 708 add(str2, str2, result_tmp, LSL, str2_chr_shift); 709 cmp(str2, str2end); 710 br(LE, BMLOOPSTR2); 711 add(sp, sp, ASIZE); 712 b(NOMATCH); 713 BIND(BMLOOPSTR1_LASTCMP); 714 cmp(ch1, ch2); 715 br(NE, BMSKIP); 716 BIND(BMMATCH); 717 sub(result, str2, tmp5); 718 if (!str2_isL) lsr(result, result, 1); 719 add(sp, sp, ASIZE); 720 b(DONE); 721 722 BIND(LINEARSTUB); 723 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 724 br(LT, LINEAR_MEDIUM); 725 mov(result, zr); 726 RuntimeAddress stub = nullptr; 727 if (isL) { 728 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 729 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 730 } else if (str1_isL) { 731 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 732 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 733 } else { 734 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 735 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 736 } 737 address call = trampoline_call(stub); 738 if (call == nullptr) { 739 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 740 ciEnv::current()->record_failure("CodeCache is full"); 741 return; 742 } 743 b(DONE); 744 } 745 746 BIND(LINEARSEARCH); 747 { 748 Label DO1, DO2, DO3; 749 750 Register str2tmp = tmp2; 751 Register first = tmp3; 752 753 if (icnt1 == 
-1) 754 { 755 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 756 757 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 758 br(LT, DOSHORT); 759 BIND(LINEAR_MEDIUM); 760 (this->*str1_load_1chr)(first, Address(str1)); 761 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 762 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 763 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 764 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 765 766 BIND(FIRST_LOOP); 767 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 768 cmp(first, ch2); 769 br(EQ, STR1_LOOP); 770 BIND(STR2_NEXT); 771 adds(cnt2_neg, cnt2_neg, str2_chr_size); 772 br(LE, FIRST_LOOP); 773 b(NOMATCH); 774 775 BIND(STR1_LOOP); 776 adds(cnt1tmp, cnt1_neg, str1_chr_size); 777 add(cnt2tmp, cnt2_neg, str2_chr_size); 778 br(GE, MATCH); 779 780 BIND(STR1_NEXT); 781 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 782 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 783 cmp(ch1, ch2); 784 br(NE, STR2_NEXT); 785 adds(cnt1tmp, cnt1tmp, str1_chr_size); 786 add(cnt2tmp, cnt2tmp, str2_chr_size); 787 br(LT, STR1_NEXT); 788 b(MATCH); 789 790 BIND(DOSHORT); 791 if (str1_isL == str2_isL) { 792 cmp(cnt1, (u1)2); 793 br(LT, DO1); 794 br(GT, DO3); 795 } 796 } 797 798 if (icnt1 == 4) { 799 Label CH1_LOOP; 800 801 (this->*load_4chr)(ch1, str1); 802 sub(result_tmp, cnt2, 4); 803 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 804 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 805 806 BIND(CH1_LOOP); 807 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 808 cmp(ch1, ch2); 809 br(EQ, MATCH); 810 adds(cnt2_neg, cnt2_neg, str2_chr_size); 811 br(LE, CH1_LOOP); 812 b(NOMATCH); 813 } 814 815 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 816 Label CH1_LOOP; 817 818 BIND(DO2); 819 (this->*load_2chr)(ch1, str1); 820 if (icnt1 == 2) { 821 sub(result_tmp, cnt2, 2); 822 } 823 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 824 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 825 BIND(CH1_LOOP); 826 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 827 cmp(ch1, ch2); 828 br(EQ, MATCH); 829 adds(cnt2_neg, cnt2_neg, str2_chr_size); 830 br(LE, CH1_LOOP); 831 b(NOMATCH); 832 } 833 834 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 835 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 836 837 BIND(DO3); 838 (this->*load_2chr)(first, str1); 839 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 840 if (icnt1 == 3) { 841 sub(result_tmp, cnt2, 3); 842 } 843 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 844 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 845 BIND(FIRST_LOOP); 846 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 847 cmpw(first, ch2); 848 br(EQ, STR1_LOOP); 849 BIND(STR2_NEXT); 850 adds(cnt2_neg, cnt2_neg, str2_chr_size); 851 br(LE, FIRST_LOOP); 852 b(NOMATCH); 853 854 BIND(STR1_LOOP); 855 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 856 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 857 cmp(ch1, ch2); 858 br(NE, STR2_NEXT); 859 b(MATCH); 860 } 861 862 if (icnt1 == -1 || icnt1 == 1) { 863 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 864 865 BIND(DO1); 866 (this->*str1_load_1chr)(ch1, str1); 867 cmp(cnt2, (u1)8); 868 br(LT, DO1_SHORT); 869 870 sub(result_tmp, cnt2, 8/str2_chr_size); 871 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 872 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 873 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 874 875 if (str2_isL) { 876 orr(ch1, ch1, ch1, LSL, 8); 877 } 878 orr(ch1, ch1, ch1, LSL, 16); 879 orr(ch1, ch1, ch1, LSL, 32); 880 BIND(CH1_LOOP); 881 ldr(ch2, Address(str2, cnt2_neg)); 882 eor(ch2, ch1, ch2); 883 sub(tmp1, ch2, tmp3); 884 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 885 bics(tmp1, tmp1, tmp2); 886 br(NE, HAS_ZERO); 887 adds(cnt2_neg, cnt2_neg, 8); 888 br(LT, CH1_LOOP); 889 890 cmp(cnt2_neg, (u1)8); 891 mov(cnt2_neg, 0); 892 br(LT, CH1_LOOP); 893 b(NOMATCH); 894 895 BIND(HAS_ZERO); 896 rev(tmp1, tmp1); 897 clz(tmp1, tmp1); 898 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 899 b(MATCH); 900 901 BIND(DO1_SHORT); 902 mov(result_tmp, cnt2); 903 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 904 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 905 BIND(DO1_LOOP); 906 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 907 cmpw(ch1, ch2); 908 br(EQ, MATCH); 909 adds(cnt2_neg, cnt2_neg, str2_chr_size); 910 br(LT, DO1_LOOP); 911 } 912 } 913 BIND(NOMATCH); 914 mov(result, -1); 915 b(DONE); 916 BIND(MATCH); 917 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 918 BIND(DONE); 919 } 920 921 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 922 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 923 924 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 925 Register ch, Register result, 926 Register tmp1, Register tmp2, Register tmp3) 927 { 928 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 929 Register cnt1_neg = cnt1; 930 Register ch1 = rscratch1; 931 Register result_tmp = rscratch2; 932 933 cbz(cnt1, NOMATCH); 934 935 cmp(cnt1, (u1)4); 936 br(LT, DO1_SHORT); 937 938 orr(ch, ch, ch, LSL, 16); 939 orr(ch, ch, ch, LSL, 32); 940 941 sub(cnt1, cnt1, 4); 942 mov(result_tmp, cnt1); 943 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 944 sub(cnt1_neg, zr, cnt1, LSL, 1); 945 946 mov(tmp3, 0x0001000100010001); 947 948 BIND(CH1_LOOP); 949 ldr(ch1, Address(str1, cnt1_neg)); 950 eor(ch1, ch, ch1); 951 sub(tmp1, ch1, tmp3); 952 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 953 bics(tmp1, tmp1, tmp2); 954 br(NE, HAS_ZERO); 955 adds(cnt1_neg, cnt1_neg, 8); 956 br(LT, CH1_LOOP); 957 958 cmp(cnt1_neg, (u1)8); 959 mov(cnt1_neg, 0); 960 br(LT, CH1_LOOP); 961 b(NOMATCH); 962 963 BIND(HAS_ZERO); 964 rev(tmp1, tmp1); 965 clz(tmp1, tmp1); 966 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 967 b(MATCH); 968 969 BIND(DO1_SHORT); 970 mov(result_tmp, cnt1); 971 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 972 sub(cnt1_neg, zr, cnt1, LSL, 1); 973 BIND(DO1_LOOP); 974 ldrh(ch1, Address(str1, cnt1_neg)); 975 cmpw(ch, ch1); 976 br(EQ, MATCH); 977 adds(cnt1_neg, cnt1_neg, 2); 978 br(LT, DO1_LOOP); 979 BIND(NOMATCH); 980 mov(result, -1); 981 b(DONE); 982 BIND(MATCH); 983 add(result, result_tmp, cnt1_neg, ASR, 1); 984 BIND(DONE); 985 } 986 987 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 988 Register ch, Register result, 989 FloatRegister ztmp1, 990 FloatRegister ztmp2, 991 PRegister tmp_pg, 992 PRegister tmp_pdn, bool isL) 993 { 994 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 995 assert(tmp_pg->is_governing(), 996 "this register has to be a governing predicate register"); 997 998 Label LOOP, MATCH, DONE, NOMATCH; 999 Register vec_len = rscratch1; 1000 Register idx = rscratch2; 1001 1002 SIMD_RegVariant T = (isL == true) ? 
B : H; 1003 1004 cbz(cnt1, NOMATCH); 1005 1006 // Assign the particular char throughout the vector. 1007 sve_dup(ztmp2, T, ch); 1008 if (isL) { 1009 sve_cntb(vec_len); 1010 } else { 1011 sve_cnth(vec_len); 1012 } 1013 mov(idx, 0); 1014 1015 // Generate a predicate to control the reading of input string. 1016 sve_whilelt(tmp_pg, T, idx, cnt1); 1017 1018 BIND(LOOP); 1019 // Read a vector of 8- or 16-bit data depending on the string type. Note 1020 // that inactive elements indicated by the predicate register won't cause 1021 // a data read from memory to the destination vector. 1022 if (isL) { 1023 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1024 } else { 1025 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1026 } 1027 add(idx, idx, vec_len); 1028 1029 // Perform the comparison. An element of the destination predicate is set 1030 // to active if the particular char is matched. 1031 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1032 1033 // Branch if the particular char is found. 1034 br(NE, MATCH); 1035 1036 sve_whilelt(tmp_pg, T, idx, cnt1); 1037 1038 // Loop back if the particular char not found. 1039 br(MI, LOOP); 1040 1041 BIND(NOMATCH); 1042 mov(result, -1); 1043 b(DONE); 1044 1045 BIND(MATCH); 1046 // Undo the index increment. 1047 sub(idx, idx, vec_len); 1048 1049 // Crop the vector to find its location. 1050 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1051 add(result, idx, -1); 1052 sve_incp(result, T, tmp_pdn); 1053 BIND(DONE); 1054 } 1055 1056 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1057 Register ch, Register result, 1058 Register tmp1, Register tmp2, Register tmp3) 1059 { 1060 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1061 Register cnt1_neg = cnt1; 1062 Register ch1 = rscratch1; 1063 Register result_tmp = rscratch2; 1064 1065 cbz(cnt1, NOMATCH); 1066 1067 cmp(cnt1, (u1)8); 1068 br(LT, DO1_SHORT); 1069 1070 orr(ch, ch, ch, LSL, 8); 1071 orr(ch, ch, ch, LSL, 16); 1072 orr(ch, ch, ch, LSL, 32); 1073 1074 sub(cnt1, cnt1, 8); 1075 mov(result_tmp, cnt1); 1076 lea(str1, Address(str1, cnt1)); 1077 sub(cnt1_neg, zr, cnt1); 1078 1079 mov(tmp3, 0x0101010101010101); 1080 1081 BIND(CH1_LOOP); 1082 ldr(ch1, Address(str1, cnt1_neg)); 1083 eor(ch1, ch, ch1); 1084 sub(tmp1, ch1, tmp3); 1085 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1086 bics(tmp1, tmp1, tmp2); 1087 br(NE, HAS_ZERO); 1088 adds(cnt1_neg, cnt1_neg, 8); 1089 br(LT, CH1_LOOP); 1090 1091 cmp(cnt1_neg, (u1)8); 1092 mov(cnt1_neg, 0); 1093 br(LT, CH1_LOOP); 1094 b(NOMATCH); 1095 1096 BIND(HAS_ZERO); 1097 rev(tmp1, tmp1); 1098 clz(tmp1, tmp1); 1099 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1100 b(MATCH); 1101 1102 BIND(DO1_SHORT); 1103 mov(result_tmp, cnt1); 1104 lea(str1, Address(str1, cnt1)); 1105 sub(cnt1_neg, zr, cnt1); 1106 BIND(DO1_LOOP); 1107 ldrb(ch1, Address(str1, cnt1_neg)); 1108 cmp(ch, ch1); 1109 br(EQ, MATCH); 1110 adds(cnt1_neg, cnt1_neg, 1); 1111 br(LT, DO1_LOOP); 1112 BIND(NOMATCH); 1113 mov(result, -1); 1114 b(DONE); 1115 BIND(MATCH); 1116 add(result, result_tmp, cnt1_neg); 1117 BIND(DONE); 1118 } 1119 1120 // Compare strings. 
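// A rough scalar reference for what the comparison below computes (an
// illustration only, not the generated code), ignoring the Latin1/UTF-16
// encoding combinations and assuming cnt1/cnt2 are already character counts:
//
//   int compare(const jchar* str1, int cnt1, const jchar* str2, int cnt2) {
//     int len = cnt1 < cnt2 ? cnt1 : cnt2;
//     for (int i = 0; i < len; i++) {
//       if (str1[i] != str2[i]) return str1[i] - str2[i];
//     }
//     return cnt1 - cnt2;
//   }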
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
1163 subsw(result, cnt1, cnt2); 1164 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1165 1166 // A very short string 1167 cmpw(cnt2, minCharsInWord); 1168 br(Assembler::LE, SHORT_STRING); 1169 1170 // Compare longwords 1171 // load first parts of strings and finish initialization while loading 1172 { 1173 if (str1_isL == str2_isL) { // LL or UU 1174 ldr(tmp1, Address(str1)); 1175 cmp(str1, str2); 1176 br(Assembler::EQ, DONE); 1177 ldr(tmp2, Address(str2)); 1178 cmp(cnt2, stub_threshold); 1179 br(GE, STUB); 1180 subsw(cnt2, cnt2, minCharsInWord); 1181 br(EQ, TAIL_CHECK); 1182 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1183 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1184 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1185 } else if (isLU) { 1186 ldrs(vtmp, Address(str1)); 1187 ldr(tmp2, Address(str2)); 1188 cmp(cnt2, stub_threshold); 1189 br(GE, STUB); 1190 subw(cnt2, cnt2, 4); 1191 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1192 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1193 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1194 zip1(vtmp, T8B, vtmp, vtmpZ); 1195 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1196 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1197 add(cnt1, cnt1, 4); 1198 fmovd(tmp1, vtmp); 1199 } else { // UL case 1200 ldr(tmp1, Address(str1)); 1201 ldrs(vtmp, Address(str2)); 1202 cmp(cnt2, stub_threshold); 1203 br(GE, STUB); 1204 subw(cnt2, cnt2, 4); 1205 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1206 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1207 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1208 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1209 zip1(vtmp, T8B, vtmp, vtmpZ); 1210 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1211 add(cnt1, cnt1, 8); 1212 fmovd(tmp2, vtmp); 1213 } 1214 adds(cnt2, cnt2, isUL ? 4 : 8); 1215 br(GE, TAIL); 1216 eor(rscratch2, tmp1, tmp2); 1217 cbnz(rscratch2, DIFF); 1218 // main loop 1219 bind(NEXT_WORD); 1220 if (str1_isL == str2_isL) { 1221 ldr(tmp1, Address(str1, cnt2)); 1222 ldr(tmp2, Address(str2, cnt2)); 1223 adds(cnt2, cnt2, 8); 1224 } else if (isLU) { 1225 ldrs(vtmp, Address(str1, cnt1)); 1226 ldr(tmp2, Address(str2, cnt2)); 1227 add(cnt1, cnt1, 4); 1228 zip1(vtmp, T8B, vtmp, vtmpZ); 1229 fmovd(tmp1, vtmp); 1230 adds(cnt2, cnt2, 8); 1231 } else { // UL 1232 ldrs(vtmp, Address(str2, cnt2)); 1233 ldr(tmp1, Address(str1, cnt1)); 1234 zip1(vtmp, T8B, vtmp, vtmpZ); 1235 add(cnt1, cnt1, 8); 1236 fmovd(tmp2, vtmp); 1237 adds(cnt2, cnt2, 4); 1238 } 1239 br(GE, TAIL); 1240 1241 eor(rscratch2, tmp1, tmp2); 1242 cbz(rscratch2, NEXT_WORD); 1243 b(DIFF); 1244 bind(TAIL); 1245 eor(rscratch2, tmp1, tmp2); 1246 cbnz(rscratch2, DIFF); 1247 // Last longword. In the case where length == 4 we compare the 1248 // same longword twice, but that's still faster than another 1249 // conditional branch. 1250 if (str1_isL == str2_isL) { 1251 ldr(tmp1, Address(str1)); 1252 ldr(tmp2, Address(str2)); 1253 } else if (isLU) { 1254 ldrs(vtmp, Address(str1)); 1255 ldr(tmp2, Address(str2)); 1256 zip1(vtmp, T8B, vtmp, vtmpZ); 1257 fmovd(tmp1, vtmp); 1258 } else { // UL 1259 ldrs(vtmp, Address(str2)); 1260 ldr(tmp1, Address(str1)); 1261 zip1(vtmp, T8B, vtmp, vtmpZ); 1262 fmovd(tmp2, vtmp); 1263 } 1264 bind(TAIL_CHECK); 1265 eor(rscratch2, tmp1, tmp2); 1266 cbz(rscratch2, DONE); 1267 1268 // Find the first different characters in the longwords and 1269 // compute their difference. 1270 bind(DIFF); 1271 rev(rscratch2, rscratch2); 1272 clz(rscratch2, rscratch2); 1273 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1274 lsrv(tmp1, tmp1, rscratch2); 1275 (this->*ext_chr)(tmp1, tmp1); 1276 lsrv(tmp2, tmp2, rscratch2); 1277 (this->*ext_chr)(tmp2, tmp2); 1278 subw(result, tmp1, tmp2); 1279 b(DONE); 1280 } 1281 1282 bind(STUB); 1283 RuntimeAddress stub = nullptr; 1284 switch(ae) { 1285 case StrIntrinsicNode::LL: 1286 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1287 break; 1288 case StrIntrinsicNode::UU: 1289 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1290 break; 1291 case StrIntrinsicNode::LU: 1292 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1293 break; 1294 case StrIntrinsicNode::UL: 1295 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1296 break; 1297 default: 1298 ShouldNotReachHere(); 1299 } 1300 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1301 address call = trampoline_call(stub); 1302 if (call == nullptr) { 1303 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1304 ciEnv::current()->record_failure("CodeCache is full"); 1305 return; 1306 } 1307 b(DONE); 1308 1309 bind(SHORT_STRING); 1310 // Is the minimum length zero? 1311 cbz(cnt2, DONE); 1312 // arrange code to do most branches while loading and loading next characters 1313 // while comparing previous 1314 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1315 subs(cnt2, cnt2, 1); 1316 br(EQ, SHORT_LAST_INIT); 1317 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1318 b(SHORT_LOOP_START); 1319 bind(SHORT_LOOP); 1320 subs(cnt2, cnt2, 1); 1321 br(EQ, SHORT_LAST); 1322 bind(SHORT_LOOP_START); 1323 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1324 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1325 cmp(tmp1, cnt1); 1326 br(NE, SHORT_LOOP_TAIL); 1327 subs(cnt2, cnt2, 1); 1328 br(EQ, SHORT_LAST2); 1329 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1330 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1331 cmp(tmp2, rscratch1); 1332 br(EQ, SHORT_LOOP); 1333 sub(result, tmp2, rscratch1); 1334 b(DONE); 1335 bind(SHORT_LOOP_TAIL); 1336 sub(result, tmp1, cnt1); 1337 b(DONE); 1338 bind(SHORT_LAST2); 1339 cmp(tmp2, rscratch1); 1340 br(EQ, DONE); 1341 sub(result, tmp2, rscratch1); 1342 1343 b(DONE); 1344 bind(SHORT_LAST_INIT); 1345 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1346 bind(SHORT_LAST); 1347 cmp(tmp1, cnt1); 1348 br(EQ, DONE); 1349 sub(result, tmp1, cnt1); 1350 1351 bind(DONE); 1352 1353 BLOCK_COMMENT("} string_compare"); 1354 } 1355 1356 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1357 FloatRegister src2, Condition cond, bool isQ) { 1358 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1359 FloatRegister zn = src1, zm = src2; 1360 bool needs_negation = false; 1361 switch (cond) { 1362 case LT: cond = GT; zn = src2; zm = src1; break; 1363 case LE: cond = GE; zn = src2; zm = src1; break; 1364 case LO: cond = HI; zn = src2; zm = src1; break; 1365 case LS: cond = HS; zn = src2; zm = src1; break; 1366 case NE: cond = EQ; needs_negation = true; break; 1367 default: 1368 break; 1369 } 1370 1371 if (is_floating_point_type(bt)) { 1372 fcm(cond, dst, size, zn, zm); 1373 } else { 1374 cm(cond, dst, size, zn, zm); 1375 } 1376 1377 if (needs_negation) { 1378 notr(dst, isQ ? 
T16B : T8B, dst); 1379 } 1380 } 1381 1382 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1383 Condition cond, bool isQ) { 1384 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1385 if (bt == T_FLOAT || bt == T_DOUBLE) { 1386 if (cond == Assembler::NE) { 1387 fcm(Assembler::EQ, dst, size, src); 1388 notr(dst, isQ ? T16B : T8B, dst); 1389 } else { 1390 fcm(cond, dst, size, src); 1391 } 1392 } else { 1393 if (cond == Assembler::NE) { 1394 cm(Assembler::EQ, dst, size, src); 1395 notr(dst, isQ ? T16B : T8B, dst); 1396 } else { 1397 cm(cond, dst, size, src); 1398 } 1399 } 1400 } 1401 1402 // Compress the least significant bit of each byte to the rightmost and clear 1403 // the higher garbage bits. 1404 void C2_MacroAssembler::bytemask_compress(Register dst) { 1405 // Example input, dst = 0x01 00 00 00 01 01 00 01 1406 // The "??" bytes are garbage. 1407 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1408 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1409 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1410 andr(dst, dst, 0xff); // dst = 0x8D 1411 } 1412 1413 // Pack the lowest-numbered bit of each mask element in src into a long value 1414 // in dst, at most the first 64 lane elements. 1415 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1416 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1417 FloatRegister vtmp1, FloatRegister vtmp2) { 1418 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1419 assert_different_registers(dst, rscratch1); 1420 assert_different_registers(vtmp1, vtmp2); 1421 1422 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1423 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1424 // Expected: dst = 0x658D 1425 1426 // Convert the mask into vector with sequential bytes. 1427 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1428 sve_cpy(vtmp1, size, src, 1, false); 1429 if (bt != T_BYTE) { 1430 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1431 } 1432 1433 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1434 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1435 // is to compress each significant bit of the byte in a cross-lane way. Due 1436 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1437 // (bit-compress in each lane) with the biggest lane size (T = D) then 1438 // concatenate the results. 1439 1440 // The second source input of BEXT, initialized with 0x01 in each byte. 1441 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1442 sve_dup(vtmp2, B, 1); 1443 1444 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1445 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1446 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1447 // --------------------------------------- 1448 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1449 sve_bext(vtmp1, D, vtmp1, vtmp2); 1450 1451 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1452 // result to dst. 1453 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1454 // dst = 0x658D 1455 if (lane_cnt <= 8) { 1456 // No need to concatenate. 
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different types,
// with a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put the long value from the general purpose register into the first lane of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum granularity of one byte, we have
  // to transform the value in the first lane, which is currently a mask in bits,
  // into a mask in bytes. This can be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte into its own
  // 8-byte lane.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
1527 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1528 sve_dup(vtmp2, B, 1); 1529 1530 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1531 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1532 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1533 // --------------------------------------- 1534 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1535 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1536 1537 if (bt != T_BYTE) { 1538 sve_vector_extend(vtmp1, size, vtmp1, B); 1539 } 1540 // Generate mask according to the given vector, in which the elements have been 1541 // extended to expected type. 1542 // dst = 0b01101001 10001101 1543 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1544 } 1545 1546 // Clobbers: rflags 1547 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1548 FloatRegister zn, FloatRegister zm, Condition cond) { 1549 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1550 FloatRegister z1 = zn, z2 = zm; 1551 switch (cond) { 1552 case LE: z1 = zm; z2 = zn; cond = GE; break; 1553 case LT: z1 = zm; z2 = zn; cond = GT; break; 1554 case LO: z1 = zm; z2 = zn; cond = HI; break; 1555 case LS: z1 = zm; z2 = zn; cond = HS; break; 1556 default: 1557 break; 1558 } 1559 1560 SIMD_RegVariant size = elemType_to_regVariant(bt); 1561 if (is_floating_point_type(bt)) { 1562 sve_fcm(cond, pd, size, pg, z1, z2); 1563 } else { 1564 assert(is_integral_type(bt), "unsupported element type"); 1565 sve_cmp(cond, pd, size, pg, z1, z2); 1566 } 1567 } 1568 1569 // Get index of the last mask lane that is set 1570 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1571 SIMD_RegVariant size = elemType_to_regVariant(bt); 1572 sve_rev(ptmp, size, src); 1573 sve_brkb(ptmp, ptrue, ptmp, false); 1574 sve_cntp(dst, size, ptrue, ptmp); 1575 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1576 subw(dst, rscratch1, dst); 1577 } 1578 1579 // Extend integer vector src to dst with the same lane count 1580 // but larger element size, e.g. 4B -> 4I 1581 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1582 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1583 if (src_bt == T_BYTE) { 1584 if (dst_bt == T_SHORT) { 1585 // 4B/8B to 4S/8S 1586 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1587 } else { 1588 // 4B to 4I 1589 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1590 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1591 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1592 } 1593 } else if (src_bt == T_SHORT) { 1594 // 4S to 4I 1595 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1596 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1597 } else if (src_bt == T_INT) { 1598 // 2I to 2L 1599 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1600 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1601 } else { 1602 ShouldNotReachHere(); 1603 } 1604 } 1605 1606 // Narrow integer vector src down to dst with the same lane count 1607 // but smaller element size, e.g. 
4I -> 4B 1608 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1609 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1610 if (src_bt == T_SHORT) { 1611 // 4S/8S to 4B/8B 1612 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1613 assert(dst_bt == T_BYTE, "unsupported"); 1614 xtn(dst, T8B, src, T8H); 1615 } else if (src_bt == T_INT) { 1616 // 4I to 4B/4S 1617 assert(src_vlen_in_bytes == 16, "unsupported"); 1618 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1619 xtn(dst, T4H, src, T4S); 1620 if (dst_bt == T_BYTE) { 1621 xtn(dst, T8B, dst, T8H); 1622 } 1623 } else if (src_bt == T_LONG) { 1624 // 2L to 2I 1625 assert(src_vlen_in_bytes == 16, "unsupported"); 1626 assert(dst_bt == T_INT, "unsupported"); 1627 xtn(dst, T2S, src, T2D); 1628 } else { 1629 ShouldNotReachHere(); 1630 } 1631 } 1632 1633 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1634 FloatRegister src, SIMD_RegVariant src_size, 1635 bool is_unsigned) { 1636 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1637 1638 if (src_size == B) { 1639 switch (dst_size) { 1640 case H: 1641 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1642 break; 1643 case S: 1644 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1645 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1646 break; 1647 case D: 1648 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1649 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1650 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1651 break; 1652 default: 1653 ShouldNotReachHere(); 1654 } 1655 } else if (src_size == H) { 1656 if (dst_size == S) { 1657 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1658 } else { // D 1659 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1660 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1661 } 1662 } else if (src_size == S) { 1663 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1664 } 1665 } 1666 1667 // Vector narrow from src to dst with specified element sizes. 1668 // High part of dst vector will be filled with zero. 1669 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1670 FloatRegister src, SIMD_RegVariant src_size, 1671 FloatRegister tmp) { 1672 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1673 assert_different_registers(src, tmp); 1674 sve_dup(tmp, src_size, 0); 1675 if (src_size == D) { 1676 switch (dst_size) { 1677 case S: 1678 sve_uzp1(dst, S, src, tmp); 1679 break; 1680 case H: 1681 assert_different_registers(dst, tmp); 1682 sve_uzp1(dst, S, src, tmp); 1683 sve_uzp1(dst, H, dst, tmp); 1684 break; 1685 case B: 1686 assert_different_registers(dst, tmp); 1687 sve_uzp1(dst, S, src, tmp); 1688 sve_uzp1(dst, H, dst, tmp); 1689 sve_uzp1(dst, B, dst, tmp); 1690 break; 1691 default: 1692 ShouldNotReachHere(); 1693 } 1694 } else if (src_size == S) { 1695 if (dst_size == H) { 1696 sve_uzp1(dst, H, src, tmp); 1697 } else { // B 1698 assert_different_registers(dst, tmp); 1699 sve_uzp1(dst, H, src, tmp); 1700 sve_uzp1(dst, B, dst, tmp); 1701 } 1702 } else if (src_size == H) { 1703 sve_uzp1(dst, B, src, tmp); 1704 } 1705 } 1706 1707 // Extend src predicate to dst predicate with the same lane count but larger 1708 // element size, e.g. 
64Byte -> 512Long 1709 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1710 uint dst_element_length_in_bytes, 1711 uint src_element_length_in_bytes) { 1712 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1713 sve_punpklo(dst, src); 1714 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1715 sve_punpklo(dst, src); 1716 sve_punpklo(dst, dst); 1717 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1718 sve_punpklo(dst, src); 1719 sve_punpklo(dst, dst); 1720 sve_punpklo(dst, dst); 1721 } else { 1722 assert(false, "unsupported"); 1723 ShouldNotReachHere(); 1724 } 1725 } 1726 1727 // Narrow src predicate to dst predicate with the same lane count but 1728 // smaller element size, e.g. 512Long -> 64Byte 1729 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1730 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1731 // The insignificant bits in src predicate are expected to be zero. 1732 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1733 // passed as the second argument. An example narrowing operation with a given mask would be - 1734 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1735 // Mask (for 2 Longs) : TF 1736 // Predicate register for the above mask (16 bits) : 00000001 00000000 1737 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1738 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1739 assert_different_registers(src, ptmp); 1740 assert_different_registers(dst, ptmp); 1741 sve_pfalse(ptmp); 1742 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1743 sve_uzp1(dst, B, src, ptmp); 1744 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1745 sve_uzp1(dst, H, src, ptmp); 1746 sve_uzp1(dst, B, dst, ptmp); 1747 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1748 sve_uzp1(dst, S, src, ptmp); 1749 sve_uzp1(dst, H, dst, ptmp); 1750 sve_uzp1(dst, B, dst, ptmp); 1751 } else { 1752 assert(false, "unsupported"); 1753 ShouldNotReachHere(); 1754 } 1755 } 1756 1757 // Vector reduction add for integral type with ASIMD instructions. 1758 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1759 Register isrc, FloatRegister vsrc, 1760 unsigned vector_length_in_bytes, 1761 FloatRegister vtmp) { 1762 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1763 assert_different_registers(dst, isrc); 1764 bool isQ = vector_length_in_bytes == 16; 1765 1766 BLOCK_COMMENT("neon_reduce_add_integral {"); 1767 switch(bt) { 1768 case T_BYTE: 1769 addv(vtmp, isQ ? T16B : T8B, vsrc); 1770 smov(dst, vtmp, B, 0); 1771 addw(dst, dst, isrc, ext::sxtb); 1772 break; 1773 case T_SHORT: 1774 addv(vtmp, isQ ? T8H : T4H, vsrc); 1775 smov(dst, vtmp, H, 0); 1776 addw(dst, dst, isrc, ext::sxth); 1777 break; 1778 case T_INT: 1779 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1780 umov(dst, vtmp, S, 0); 1781 addw(dst, dst, isrc); 1782 break; 1783 case T_LONG: 1784 assert(isQ, "unsupported"); 1785 addpd(vtmp, vsrc); 1786 umov(dst, vtmp, D, 0); 1787 add(dst, dst, isrc); 1788 break; 1789 default: 1790 assert(false, "unsupported"); 1791 ShouldNotReachHere(); 1792 } 1793 BLOCK_COMMENT("} neon_reduce_add_integral"); 1794 } 1795 1796 // Vector reduction multiply for integral type with ASIMD instructions. 
1797 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1798 // Clobbers: rscratch1 1799 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1800 Register isrc, FloatRegister vsrc, 1801 unsigned vector_length_in_bytes, 1802 FloatRegister vtmp1, FloatRegister vtmp2) { 1803 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1804 bool isQ = vector_length_in_bytes == 16; 1805 1806 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1807 switch(bt) { 1808 case T_BYTE: 1809 if (isQ) { 1810 // Multiply the lower half and higher half of vector iteratively. 1811 // vtmp1 = vsrc[8:15] 1812 ins(vtmp1, D, vsrc, 0, 1); 1813 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1814 mulv(vtmp1, T8B, vtmp1, vsrc); 1815 // vtmp2 = vtmp1[4:7] 1816 ins(vtmp2, S, vtmp1, 0, 1); 1817 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1818 mulv(vtmp1, T8B, vtmp2, vtmp1); 1819 } else { 1820 ins(vtmp1, S, vsrc, 0, 1); 1821 mulv(vtmp1, T8B, vtmp1, vsrc); 1822 } 1823 // vtmp2 = vtmp1[2:3] 1824 ins(vtmp2, H, vtmp1, 0, 1); 1825 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1826 mulv(vtmp2, T8B, vtmp2, vtmp1); 1827 // dst = vtmp2[0] * isrc * vtmp2[1] 1828 umov(rscratch1, vtmp2, B, 0); 1829 mulw(dst, rscratch1, isrc); 1830 sxtb(dst, dst); 1831 umov(rscratch1, vtmp2, B, 1); 1832 mulw(dst, rscratch1, dst); 1833 sxtb(dst, dst); 1834 break; 1835 case T_SHORT: 1836 if (isQ) { 1837 ins(vtmp2, D, vsrc, 0, 1); 1838 mulv(vtmp2, T4H, vtmp2, vsrc); 1839 ins(vtmp1, S, vtmp2, 0, 1); 1840 mulv(vtmp1, T4H, vtmp1, vtmp2); 1841 } else { 1842 ins(vtmp1, S, vsrc, 0, 1); 1843 mulv(vtmp1, T4H, vtmp1, vsrc); 1844 } 1845 umov(rscratch1, vtmp1, H, 0); 1846 mulw(dst, rscratch1, isrc); 1847 sxth(dst, dst); 1848 umov(rscratch1, vtmp1, H, 1); 1849 mulw(dst, rscratch1, dst); 1850 sxth(dst, dst); 1851 break; 1852 case T_INT: 1853 if (isQ) { 1854 ins(vtmp1, D, vsrc, 0, 1); 1855 mulv(vtmp1, T2S, vtmp1, vsrc); 1856 } else { 1857 vtmp1 = vsrc; 1858 } 1859 umov(rscratch1, vtmp1, S, 0); 1860 mul(dst, rscratch1, isrc); 1861 umov(rscratch1, vtmp1, S, 1); 1862 mul(dst, rscratch1, dst); 1863 break; 1864 case T_LONG: 1865 umov(rscratch1, vsrc, D, 0); 1866 mul(dst, isrc, rscratch1); 1867 umov(rscratch1, vsrc, D, 1); 1868 mul(dst, dst, rscratch1); 1869 break; 1870 default: 1871 assert(false, "unsupported"); 1872 ShouldNotReachHere(); 1873 } 1874 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1875 } 1876 1877 // Vector reduction multiply for floating-point type with ASIMD instructions. 
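// For reference, a scalar sketch (illustrative only, not part of the build) of the value
// the routine below computes for T_FLOAT; the helper name and the "lanes" parameter exist
// only for this sketch and correspond to the lane count implied by vector_length_in_bytes
// (2 or 4). The multiplies are chained strictly in lane order, matching the fmuls/fmuld
// sequence below; the ordering matters because floating-point multiplication is not
// associative, so a pairwise reduction tree could produce a different result.
//
//   float reduce_mul_float_model(float fsrc, const float* vsrc, int lanes) {
//     float acc = fsrc;
//     for (int i = 0; i < lanes; i++) {
//       acc = acc * vsrc[i];   // one fmuls per lane, in lane order
//     }
//     return acc;
//   }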
1878 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1879 FloatRegister fsrc, FloatRegister vsrc, 1880 unsigned vector_length_in_bytes, 1881 FloatRegister vtmp) { 1882 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1883 bool isQ = vector_length_in_bytes == 16; 1884 1885 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1886 switch(bt) { 1887 case T_FLOAT: 1888 fmuls(dst, fsrc, vsrc); 1889 ins(vtmp, S, vsrc, 0, 1); 1890 fmuls(dst, dst, vtmp); 1891 if (isQ) { 1892 ins(vtmp, S, vsrc, 0, 2); 1893 fmuls(dst, dst, vtmp); 1894 ins(vtmp, S, vsrc, 0, 3); 1895 fmuls(dst, dst, vtmp); 1896 } 1897 break; 1898 case T_DOUBLE: 1899 assert(isQ, "unsupported"); 1900 fmuld(dst, fsrc, vsrc); 1901 ins(vtmp, D, vsrc, 0, 1); 1902 fmuld(dst, dst, vtmp); 1903 break; 1904 default: 1905 assert(false, "unsupported"); 1906 ShouldNotReachHere(); 1907 } 1908 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1909 } 1910 1911 // Helper to select logical instruction 1912 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1913 Register Rn, Register Rm, 1914 enum shift_kind kind, unsigned shift) { 1915 switch(opc) { 1916 case Op_AndReductionV: 1917 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1918 break; 1919 case Op_OrReductionV: 1920 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1921 break; 1922 case Op_XorReductionV: 1923 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1924 break; 1925 default: 1926 assert(false, "unsupported"); 1927 ShouldNotReachHere(); 1928 } 1929 } 1930 1931 // Vector reduction logical operations And, Or, Xor 1932 // Clobbers: rscratch1 1933 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1934 Register isrc, FloatRegister vsrc, 1935 unsigned vector_length_in_bytes) { 1936 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1937 "unsupported"); 1938 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1939 assert_different_registers(dst, isrc); 1940 bool isQ = vector_length_in_bytes == 16; 1941 1942 BLOCK_COMMENT("neon_reduce_logical {"); 1943 umov(rscratch1, vsrc, isQ ? D : S, 0); 1944 umov(dst, vsrc, isQ ? 
D : S, 1); 1945 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 1946 switch(bt) { 1947 case T_BYTE: 1948 if (isQ) { 1949 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1950 } 1951 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1952 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 1953 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1954 sxtb(dst, dst); 1955 break; 1956 case T_SHORT: 1957 if (isQ) { 1958 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1959 } 1960 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1961 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1962 sxth(dst, dst); 1963 break; 1964 case T_INT: 1965 if (isQ) { 1966 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1967 } 1968 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1969 break; 1970 case T_LONG: 1971 assert(isQ, "unsupported"); 1972 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 1973 break; 1974 default: 1975 assert(false, "unsupported"); 1976 ShouldNotReachHere(); 1977 } 1978 BLOCK_COMMENT("} neon_reduce_logical"); 1979 } 1980 1981 // Vector reduction min/max for integral type with ASIMD instructions. 1982 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 1983 // Clobbers: rscratch1, rflags 1984 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 1985 Register isrc, FloatRegister vsrc, 1986 unsigned vector_length_in_bytes, 1987 FloatRegister vtmp) { 1988 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 1989 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1990 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 1991 assert_different_registers(dst, isrc); 1992 bool isQ = vector_length_in_bytes == 16; 1993 bool is_min = opc == Op_MinReductionV; 1994 1995 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 1996 if (bt == T_LONG) { 1997 assert(vtmp == fnoreg, "should be"); 1998 assert(isQ, "should be"); 1999 umov(rscratch1, vsrc, D, 0); 2000 cmp(isrc, rscratch1); 2001 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2002 umov(rscratch1, vsrc, D, 1); 2003 cmp(dst, rscratch1); 2004 csel(dst, dst, rscratch1, is_min ? LT : GT); 2005 } else { 2006 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2007 if (size == T2S) { 2008 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2009 } else { 2010 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2011 } 2012 if (bt == T_INT) { 2013 umov(dst, vtmp, S, 0); 2014 } else { 2015 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2016 } 2017 cmpw(dst, isrc); 2018 cselw(dst, dst, isrc, is_min ? LT : GT); 2019 } 2020 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2021 } 2022 2023 // Vector reduction for integral type with SVE instruction. 2024 // Supported operations are Add, And, Or, Xor, Max, Min. 2025 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
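// For reference, a scalar sketch (illustrative only, not part of the build) of the T_INT
// AndReductionV case handled below: the vector lanes active under pg are folded first, the
// folded value is then combined with the scalar input src1, and for sub-word types the
// result is finally sign-extended back to the element width. The helper name and the
// "lanes" parameter exist only for this sketch.
//
//   int reduce_and_int_model(int src1, const int* src2, int lanes) {
//     int acc = ~0;                    // identity value for AND
//     for (int i = 0; i < lanes; i++) {
//       acc &= src2[i];                // corresponds to sve_andv over the active lanes
//     }
//     return acc & src1;               // corresponds to andw(dst, dst, src1)
//   }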
2026 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2027 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2028 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2029 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2030 assert_different_registers(src1, dst); 2031 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2032 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2033 switch (opc) { 2034 case Op_AddReductionVI: { 2035 sve_uaddv(tmp, size, pg, src2); 2036 if (bt == T_BYTE) { 2037 smov(dst, tmp, size, 0); 2038 addw(dst, src1, dst, ext::sxtb); 2039 } else if (bt == T_SHORT) { 2040 smov(dst, tmp, size, 0); 2041 addw(dst, src1, dst, ext::sxth); 2042 } else { 2043 umov(dst, tmp, size, 0); 2044 addw(dst, dst, src1); 2045 } 2046 break; 2047 } 2048 case Op_AddReductionVL: { 2049 sve_uaddv(tmp, size, pg, src2); 2050 umov(dst, tmp, size, 0); 2051 add(dst, dst, src1); 2052 break; 2053 } 2054 case Op_AndReductionV: { 2055 sve_andv(tmp, size, pg, src2); 2056 if (bt == T_INT || bt == T_LONG) { 2057 umov(dst, tmp, size, 0); 2058 } else { 2059 smov(dst, tmp, size, 0); 2060 } 2061 if (bt == T_LONG) { 2062 andr(dst, dst, src1); 2063 } else { 2064 andw(dst, dst, src1); 2065 } 2066 break; 2067 } 2068 case Op_OrReductionV: { 2069 sve_orv(tmp, size, pg, src2); 2070 if (bt == T_INT || bt == T_LONG) { 2071 umov(dst, tmp, size, 0); 2072 } else { 2073 smov(dst, tmp, size, 0); 2074 } 2075 if (bt == T_LONG) { 2076 orr(dst, dst, src1); 2077 } else { 2078 orrw(dst, dst, src1); 2079 } 2080 break; 2081 } 2082 case Op_XorReductionV: { 2083 sve_eorv(tmp, size, pg, src2); 2084 if (bt == T_INT || bt == T_LONG) { 2085 umov(dst, tmp, size, 0); 2086 } else { 2087 smov(dst, tmp, size, 0); 2088 } 2089 if (bt == T_LONG) { 2090 eor(dst, dst, src1); 2091 } else { 2092 eorw(dst, dst, src1); 2093 } 2094 break; 2095 } 2096 case Op_MaxReductionV: { 2097 sve_smaxv(tmp, size, pg, src2); 2098 if (bt == T_INT || bt == T_LONG) { 2099 umov(dst, tmp, size, 0); 2100 } else { 2101 smov(dst, tmp, size, 0); 2102 } 2103 if (bt == T_LONG) { 2104 cmp(dst, src1); 2105 csel(dst, dst, src1, Assembler::GT); 2106 } else { 2107 cmpw(dst, src1); 2108 cselw(dst, dst, src1, Assembler::GT); 2109 } 2110 break; 2111 } 2112 case Op_MinReductionV: { 2113 sve_sminv(tmp, size, pg, src2); 2114 if (bt == T_INT || bt == T_LONG) { 2115 umov(dst, tmp, size, 0); 2116 } else { 2117 smov(dst, tmp, size, 0); 2118 } 2119 if (bt == T_LONG) { 2120 cmp(dst, src1); 2121 csel(dst, dst, src1, Assembler::LT); 2122 } else { 2123 cmpw(dst, src1); 2124 cselw(dst, dst, src1, Assembler::LT); 2125 } 2126 break; 2127 } 2128 default: 2129 assert(false, "unsupported"); 2130 ShouldNotReachHere(); 2131 } 2132 2133 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2134 if (bt == T_BYTE) { 2135 sxtb(dst, dst); 2136 } else if (bt == T_SHORT) { 2137 sxth(dst, dst); 2138 } 2139 } 2140 } 2141 2142 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2143 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2144 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
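// For reference, the predicate built below is equivalent to the boolean vector produced by
// this sketch (illustrative only, not part of the build); the fixed "ptrue" patterns and
// the "whileltw" fallback are simply different encodings of the same lane set. The helper
// name and the "max_lanes" parameter exist only for this sketch and correspond to
// Matcher::max_vector_size(bt).
//
//   void gen_mask_imm_model(bool* dst, uint32_t lane_cnt, uint32_t max_lanes) {
//     for (uint32_t i = 0; i < max_lanes; i++) {
//       dst[i] = (i < lane_cnt);   // true for lanes [0, lane_cnt), false elsewhere
//     }
//   }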
2145 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) { 2146 uint32_t max_vector_length = Matcher::max_vector_size(bt); 2147 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt"); 2148 2149 // Set all elements to false if the input "lane_cnt" is zero. 2150 if (lane_cnt == 0) { 2151 sve_pfalse(dst); 2152 return; 2153 } 2154 2155 SIMD_RegVariant size = elemType_to_regVariant(bt); 2156 assert(size != Q, "invalid size"); 2157 2158 // Set all true if "lane_cnt" equals to the max lane count. 2159 if (lane_cnt == max_vector_length) { 2160 sve_ptrue(dst, size, /* ALL */ 0b11111); 2161 return; 2162 } 2163 2164 // Fixed numbers for "ptrue". 2165 switch(lane_cnt) { 2166 case 1: /* VL1 */ 2167 case 2: /* VL2 */ 2168 case 3: /* VL3 */ 2169 case 4: /* VL4 */ 2170 case 5: /* VL5 */ 2171 case 6: /* VL6 */ 2172 case 7: /* VL7 */ 2173 case 8: /* VL8 */ 2174 sve_ptrue(dst, size, lane_cnt); 2175 return; 2176 case 16: 2177 sve_ptrue(dst, size, /* VL16 */ 0b01001); 2178 return; 2179 case 32: 2180 sve_ptrue(dst, size, /* VL32 */ 0b01010); 2181 return; 2182 case 64: 2183 sve_ptrue(dst, size, /* VL64 */ 0b01011); 2184 return; 2185 case 128: 2186 sve_ptrue(dst, size, /* VL128 */ 0b01100); 2187 return; 2188 case 256: 2189 sve_ptrue(dst, size, /* VL256 */ 0b01101); 2190 return; 2191 default: 2192 break; 2193 } 2194 2195 // Special patterns for "ptrue". 2196 if (lane_cnt == round_down_power_of_2(max_vector_length)) { 2197 sve_ptrue(dst, size, /* POW2 */ 0b00000); 2198 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) { 2199 sve_ptrue(dst, size, /* MUL4 */ 0b11101); 2200 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) { 2201 sve_ptrue(dst, size, /* MUL3 */ 0b11110); 2202 } else { 2203 // Encode to "whileltw" for the remaining cases. 2204 mov(rscratch1, lane_cnt); 2205 sve_whileltw(dst, size, zr, rscratch1); 2206 } 2207 } 2208 2209 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst. 2210 // Any remaining elements of dst will be filled with zero. 2211 // Clobbers: rscratch1 2212 // Preserves: src, mask 2213 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 2214 FloatRegister vtmp1, FloatRegister vtmp2, 2215 PRegister pgtmp) { 2216 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2217 assert_different_registers(dst, src, vtmp1, vtmp2); 2218 assert_different_registers(mask, pgtmp); 2219 2220 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 2221 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 2222 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 2223 sve_dup(vtmp2, H, 0); 2224 2225 // Extend lowest half to type INT. 2226 // dst = 00004444 00003333 00002222 00001111 2227 sve_uunpklo(dst, S, src); 2228 // pgtmp = 00000001 00000000 00000001 00000001 2229 sve_punpklo(pgtmp, mask); 2230 // Pack the active elements in size of type INT to the right, 2231 // and fill the remainings with zero. 2232 // dst = 00000000 00004444 00002222 00001111 2233 sve_compact(dst, S, dst, pgtmp); 2234 // Narrow the result back to type SHORT. 2235 // dst = 0000 0000 0000 0000 0000 4444 2222 1111 2236 sve_uzp1(dst, H, dst, vtmp2); 2237 // Count the active elements of lowest half. 2238 // rscratch1 = 3 2239 sve_cntp(rscratch1, S, ptrue, pgtmp); 2240 2241 // Repeat to the highest half. 
2242 // pgtmp = 00000001 00000000 00000000 00000001 2243 sve_punpkhi(pgtmp, mask); 2244 // vtmp1 = 00008888 00007777 00006666 00005555 2245 sve_uunpkhi(vtmp1, S, src); 2246 // vtmp1 = 00000000 00000000 00008888 00005555 2247 sve_compact(vtmp1, S, vtmp1, pgtmp); 2248 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2249 sve_uzp1(vtmp1, H, vtmp1, vtmp2); 2250 2251 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111 2252 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2253 // Left shift(cross lane) compressed high with TRUE_CNT lanes, 2254 // TRUE_CNT is the number of active elements in the compressed low. 2255 neg(rscratch1, rscratch1); 2256 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2257 sve_index(vtmp2, H, rscratch1, 1); 2258 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000 2259 sve_tbl(vtmp1, H, vtmp1, vtmp2); 2260 2261 // Combine the compressed high(after shifted) with the compressed low. 2262 // dst = 0000 0000 0000 8888 5555 4444 2222 1111 2263 sve_orr(dst, dst, vtmp1); 2264 } 2265 2266 // Clobbers: rscratch1, rscratch2 2267 // Preserves: src, mask 2268 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, 2269 FloatRegister vtmp1, FloatRegister vtmp2, 2270 FloatRegister vtmp3, FloatRegister vtmp4, 2271 PRegister ptmp, PRegister pgtmp) { 2272 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2273 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4); 2274 assert_different_registers(mask, ptmp, pgtmp); 2275 // Example input: src = 88 77 66 55 44 33 22 11 2276 // mask = 01 00 00 01 01 00 01 01 2277 // Expected result: dst = 00 00 00 88 55 44 22 11 2278 2279 sve_dup(vtmp4, B, 0); 2280 // Extend lowest half to type SHORT. 2281 // vtmp1 = 0044 0033 0022 0011 2282 sve_uunpklo(vtmp1, H, src); 2283 // ptmp = 0001 0000 0001 0001 2284 sve_punpklo(ptmp, mask); 2285 // Count the active elements of lowest half. 2286 // rscratch2 = 3 2287 sve_cntp(rscratch2, H, ptrue, ptmp); 2288 // Pack the active elements in size of type SHORT to the right, 2289 // and fill the remainings with zero. 2290 // dst = 0000 0044 0022 0011 2291 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp); 2292 // Narrow the result back to type BYTE. 2293 // dst = 00 00 00 00 00 44 22 11 2294 sve_uzp1(dst, B, dst, vtmp4); 2295 2296 // Repeat to the highest half. 2297 // ptmp = 0001 0000 0000 0001 2298 sve_punpkhi(ptmp, mask); 2299 // vtmp1 = 0088 0077 0066 0055 2300 sve_uunpkhi(vtmp2, H, src); 2301 // vtmp1 = 0000 0000 0088 0055 2302 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp); 2303 2304 sve_dup(vtmp4, B, 0); 2305 // vtmp1 = 00 00 00 00 00 00 88 55 2306 sve_uzp1(vtmp1, B, vtmp1, vtmp4); 2307 2308 // Compressed low: dst = 00 00 00 00 00 44 22 11 2309 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55 2310 // Left shift(cross lane) compressed high with TRUE_CNT lanes, 2311 // TRUE_CNT is the number of active elements in the compressed low. 2312 neg(rscratch2, rscratch2); 2313 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2314 sve_index(vtmp2, B, rscratch2, 1); 2315 // vtmp1 = 00 00 00 88 55 00 00 00 2316 sve_tbl(vtmp1, B, vtmp1, vtmp2); 2317 // Combine the compressed high(after shifted) with the compressed low. 
2318 // dst = 00 00 00 88 55 44 22 11 2319 sve_orr(dst, dst, vtmp1); 2320 } 2321 2322 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2323 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2324 SIMD_Arrangement size = isQ ? T16B : T8B; 2325 if (bt == T_BYTE) { 2326 rbit(dst, size, src); 2327 } else { 2328 neon_reverse_bytes(dst, src, bt, isQ); 2329 rbit(dst, size, dst); 2330 } 2331 } 2332 2333 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2334 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2335 SIMD_Arrangement size = isQ ? T16B : T8B; 2336 switch (bt) { 2337 case T_BYTE: 2338 if (dst != src) { 2339 orr(dst, size, src, src); 2340 } 2341 break; 2342 case T_SHORT: 2343 rev16(dst, size, src); 2344 break; 2345 case T_INT: 2346 rev32(dst, size, src); 2347 break; 2348 case T_LONG: 2349 rev64(dst, size, src); 2350 break; 2351 default: 2352 assert(false, "unsupported"); 2353 ShouldNotReachHere(); 2354 } 2355 } 2356 2357 // Extract a scalar element from an sve vector at position 'idx'. 2358 // The input elements in src are expected to be of integral type. 2359 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2360 int idx, FloatRegister vtmp) { 2361 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2362 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2363 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2364 if (bt == T_INT || bt == T_LONG) { 2365 umov(dst, src, size, idx); 2366 } else { 2367 smov(dst, src, size, idx); 2368 } 2369 } else { 2370 sve_orr(vtmp, src, src); 2371 sve_ext(vtmp, vtmp, idx << size); 2372 if (bt == T_INT || bt == T_LONG) { 2373 umov(dst, vtmp, size, 0); 2374 } else { 2375 smov(dst, vtmp, size, 0); 2376 } 2377 } 2378 } 2379 2380 // java.lang.Math::round intrinsics 2381 2382 // Clobbers: rscratch1, rflags 2383 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2384 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2385 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2386 switch (T) { 2387 case T2S: 2388 case T4S: 2389 fmovs(tmp1, T, 0.5f); 2390 mov(rscratch1, jint_cast(0x1.0p23f)); 2391 break; 2392 case T2D: 2393 fmovd(tmp1, T, 0.5); 2394 mov(rscratch1, julong_cast(0x1.0p52)); 2395 break; 2396 default: 2397 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2398 } 2399 fadd(tmp1, T, tmp1, src); 2400 fcvtms(tmp1, T, tmp1); 2401 // tmp1 = floor(src + 0.5, ties to even) 2402 2403 fcvtas(dst, T, src); 2404 // dst = round(src), ties to away 2405 2406 fneg(tmp3, T, src); 2407 dup(tmp2, T, rscratch1); 2408 cm(HS, tmp3, T, tmp3, tmp2); 2409 // tmp3 is now a set of flags 2410 2411 bif(dst, T16B, tmp1, tmp3); 2412 // result in dst 2413 } 2414 2415 // Clobbers: rscratch1, rflags 2416 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2417 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2418 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2419 assert_different_registers(tmp1, tmp2, src, dst); 2420 2421 switch (T) { 2422 case S: 2423 mov(rscratch1, jint_cast(0x1.0p23f)); 2424 break; 2425 case D: 2426 mov(rscratch1, julong_cast(0x1.0p52)); 2427 break; 2428 
default: 2429 assert(T == S || T == D, "invalid register variant"); 2430 } 2431 2432 sve_frinta(dst, T, ptrue, src); 2433 // dst = round(src), ties to away 2434 2435 Label none; 2436 2437 sve_fneg(tmp1, T, ptrue, src); 2438 sve_dup(tmp2, T, rscratch1); 2439 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2440 br(EQ, none); 2441 { 2442 sve_cpy(tmp1, T, pgtmp, 0.5); 2443 sve_fadd(tmp1, T, pgtmp, src); 2444 sve_frintm(dst, T, pgtmp, tmp1); 2445 // dst = floor(src + 0.5, ties to even) 2446 } 2447 bind(none); 2448 2449 sve_fcvtzs(dst, T, ptrue, dst, T); 2450 // result in dst 2451 } 2452 2453 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2454 FloatRegister one, SIMD_Arrangement T) { 2455 assert_different_registers(dst, src, zero, one); 2456 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2457 2458 facgt(dst, T, src, zero); 2459 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise 2460 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst 2461 } 2462 2463 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero, 2464 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) { 2465 assert_different_registers(dst, src, zero, one, vtmp); 2466 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2467 2468 sve_orr(vtmp, src, src); 2469 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise 2470 switch (T) { 2471 case S: 2472 sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src 2473 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending 2474 // on the sign of the float value 2475 break; 2476 case D: 2477 sve_and(vtmp, T, min_jlong); 2478 sve_orr(vtmp, T, jlong_cast(1.0)); 2479 break; 2480 default: 2481 assert(false, "unsupported"); 2482 ShouldNotReachHere(); 2483 } 2484 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp 2485 // Result in dst 2486 } 2487 2488 bool C2_MacroAssembler::in_scratch_emit_size() { 2489 if (ciEnv::current()->task() != nullptr) { 2490 PhaseOutput* phase_output = Compile::current()->output(); 2491 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2492 return true; 2493 } 2494 } 2495 return MacroAssembler::in_scratch_emit_size(); 2496 }
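// For reference, a scalar sketch (illustrative only, not part of the build) of the per-lane
// bit trick used by vector_signum_neon/vector_signum_sve above for float lanes: the
// candidate result is formed by OR-ing the sign bit of the input into the bit pattern of
// +1.0f, which yields +1.0f or -1.0f; that candidate replaces the input only where the
// input is neither +/-0.0 nor NaN (the facgt / sve_fac test), otherwise the original lane
// is kept. The helper name exists only for this sketch.
//
//   float signum_lane_model(float x) {
//     if (x == 0.0f || x != x) {              // +/-0.0 or NaN: the compare fails,
//       return x;                             // so the original lane is kept
//     }
//     jint bits = jint_cast(x) & min_jint;    // isolate the sign bit of x
//     jint result = bits | jint_cast(1.0f);   // OR the sign bit into +1.0f
//     return (result < 0) ? -1.0f : 1.0f;     // i.e. +/-1.0f with the sign of x
//   }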