/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
    tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
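    // cmpxchg compares the markWord at oop against tmp; on success it installs
    // the box address as the new markWord and sets EQ, on failure it leaves the
    // observed markWord in disp_hdr and sets NE.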
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition holds we are done (cont) and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == NULL,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from NULL to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, rscratch1); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(rscratch1, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
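  // Load the markWord and test the monitor bit (0b10): when it is set the lock
  // is inflated and must be released through the ObjectMonitor path below.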
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a light weight lock, this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrw(t1, Address(t1, Klass::access_flags_offset()));
    tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;
    const Register t3_t = t3;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
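    // The CAS below expects the mark with the unlocked bit set (0b01) and
    // installs the same value with the lock bits cleared (0b00). For example
    // (illustrative values), an observed mark 0x...05 gives expected == 0x...05
    // and new == 0x...04.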
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register t1_tagged_monitor = t1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;
    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(Address(t1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1);
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2,
                                                Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST branch to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_monitor);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
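    // The CAS below expects the current mark with lock bits 0b00 and installs
    // the same value with the unlocked bit set. For example (illustrative
    // values), an observed mark 0x...04 gives expected == 0x...04 and
    // new == 0x...05.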
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_monitor);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    // mark contains the tagged ObjectMonitor*.
    const Register t1_monitor = t1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;

    // Untag the monitor.
    sub(t1_monitor, t1_mark, monitor_tag);

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    Label release;
    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, release);

    // The owner may be anonymous and we removed the last obj entry in
    // the lock-stack. This loses the information about the owner.
    // Write the thread to the owner field so the runtime knows the owner.
    str(rthread, Address(t2_owner_addr));
    b(slow_path);

    bind(release);
    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = y[i+j];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c < 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
    stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
    subs(tmp5, tmp5, 1);
    br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
    (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
    if (!str1_isL) {
      subs(zr, ch1, ASIZE);
      br(HS, BCSKIP);
    }
    strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
    subs(ch2, ch2, 1);
    br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8); // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
    (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
    if (str1_isL == str2_isL) {
      // re-init tmp3. It's for free because it's executed in parallel with
      // load above. Alternative is to initialize it before loop, but it'll
      // affect performance on in-order systems with 2 or more ld/st pipelines
      lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
    }
    if (!isL) { // UU/UL case
      lsl(ch2, cnt1tmp, 1); // offset in bytes
    }
    cmp(tmp3, skipch);
    br(NE, BMSKIP);
    ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
    mov(ch1, tmp6);
    if (isL) {
      b(BMLOOPSTR1_AFTER_LOAD);
    } else {
      sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
      b(BMLOOPSTR1_CMP);
    }
    BIND(BMLOOPSTR1);
    (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
    (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
    subs(cnt1tmp, cnt1tmp, 1);
    br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
    cmp(ch1, ch2);
    br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
    if (!isL) {
      // if we've met UTF symbol while searching Latin1 pattern, then we can
      // skip cnt1 symbols
      if (str1_isL != str2_isL) {
        mov(result_tmp, cnt1);
      } else {
        mov(result_tmp, 1);
      }
      subs(zr, skipch, ASIZE);
      br(HS, BMADV);
    }
    ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
    sub(cnt1tmp, cnt1, 1);
    add(str2, str2, result_tmp, LSL, str2_chr_shift);
    cmp(str2, str2end);
    br(LE, BMLOOPSTR2);
    add(sp, sp, ASIZE);
    b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
    cmp(ch1, ch2);
    br(NE, BMSKIP);
    BIND(BMMATCH);
    sub(result, str2, tmp5);
    if (!str2_isL) lsr(result, result, 1);
    add(sp, sp, ASIZE);
    b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1) {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

      BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

      BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
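      // The loop below tests a whole register at once using the usual SWAR
      // zero-detection trick: after XOR-ing the loaded chunk with the broadcast
      // character, (x - 0x01...01) & ~x & 0x80...80 is non-zero iff some byte
      // (or halfword, for UTF-16) of x is zero, i.e. iff the character occurs
      // in this chunk. bics computes the final and-not and sets the flags.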
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

      BIND(HAS_ZERO);
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

      BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

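  // Loop plan: a governing predicate generated by sve_whilelt covers the
  // remaining characters (up to one full vector per iteration); sve_cmp sets
  // NE when any active element matches, and the MI (first-element-active)
  // condition produced by sve_whilelt keeps the loop going while characters
  // remain.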
  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
  // Read a vector of 8- or 16-bit data depending on the string type. Note
  // that inactive elements indicated by the predicate register won't cause
  // a data read from memory to the destination vector.
  if (isL) {
    sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
  } else {
    sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
  }
  add(idx, idx, vec_len);

  // Perform the comparison. An element of the destination predicate is set
  // to active if the particular char is matched.
  sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

  // Branch if the particular char is found.
  br(NE, MATCH);

  sve_whilelt(tmp_pg, T, idx, cnt1);

  // Loop back if the particular char not found.
  br(MI, LOOP);

  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);

  BIND(MATCH);
  // Undo the index increment.
  sub(idx, idx, vec_len);

  // Crop the vector to find its location.
  sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
  add(result, idx, -1);
  sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
  ldrb(ch1, Address(str1, cnt1_neg));
  cmp(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 1);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
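  // For example (illustrative values), cnt1 = 3 and cnt2 = 7 leave
  // result = -4 (the value returned if the shorter string is a prefix of the
  // longer one) and cnt2 = min(cnt1, cnt2) = 3.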
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
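    // rscratch2 holds tmp1 ^ tmp2; after rev + clz it is (roughly) eight times
    // the number of equal low-order bytes. Rounding it down to a character
    // boundary (8 bits for Latin-1, 16 for UTF-16) below gives the shift that
    // brings the first differing character of each word into the low bits.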
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange code to do most branches while loading and to load the next
  // characters while comparing the previous ones.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

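  // ASIMD has no direct not-equal compare: the switch above rewrote NE into EQ
  // and the result is inverted here.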
  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates mask value with the minimum unit in byte, we should
  // transform the value in the first lane which is mask in bit now to the
  // mask in byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing. As only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate mask according to the given vector, in which the elements have been
  // extended to expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
      sxtl(dst, T8H, src, T8B);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      sxtl(dst, T8H, src, T8B);
      sxtl(dst, T4S, dst, T4H);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    sxtl(dst, T4S, src, T4H);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    sxtl(dst, T2D, src, T2S);
  } else {
    ShouldNotReachHere();
  }
}
// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
  if (src_size == B) {
    switch (dst_size) {
    case H:
      sve_sunpklo(dst, H, src);
      break;
    case S:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      break;
    case D:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      sve_sunpklo(dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      sve_sunpklo(dst, S, src);
    } else { // D
      sve_sunpklo(dst, S, src);
      sve_sunpklo(dst, D, dst);
    }
  } else if (src_size == S) {
    sve_sunpklo(dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}
64Byte -> 512Long 1708 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1709 uint dst_element_length_in_bytes, 1710 uint src_element_length_in_bytes) { 1711 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1712 sve_punpklo(dst, src); 1713 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1714 sve_punpklo(dst, src); 1715 sve_punpklo(dst, dst); 1716 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1717 sve_punpklo(dst, src); 1718 sve_punpklo(dst, dst); 1719 sve_punpklo(dst, dst); 1720 } else { 1721 assert(false, "unsupported"); 1722 ShouldNotReachHere(); 1723 } 1724 } 1725 1726 // Narrow src predicate to dst predicate with the same lane count but 1727 // smaller element size, e.g. 512Long -> 64Byte 1728 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1729 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1730 // The insignificant bits in src predicate are expected to be zero. 1731 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1732 // passed as the second argument. An example narrowing operation with a given mask would be - 1733 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1734 // Mask (for 2 Longs) : TF 1735 // Predicate register for the above mask (16 bits) : 00000001 00000000 1736 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1737 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1738 assert_different_registers(src, ptmp); 1739 assert_different_registers(dst, ptmp); 1740 sve_pfalse(ptmp); 1741 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1742 sve_uzp1(dst, B, src, ptmp); 1743 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1744 sve_uzp1(dst, H, src, ptmp); 1745 sve_uzp1(dst, B, dst, ptmp); 1746 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1747 sve_uzp1(dst, S, src, ptmp); 1748 sve_uzp1(dst, H, dst, ptmp); 1749 sve_uzp1(dst, B, dst, ptmp); 1750 } else { 1751 assert(false, "unsupported"); 1752 ShouldNotReachHere(); 1753 } 1754 } 1755 1756 // Vector reduction add for integral type with ASIMD instructions. 1757 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1758 Register isrc, FloatRegister vsrc, 1759 unsigned vector_length_in_bytes, 1760 FloatRegister vtmp) { 1761 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1762 assert_different_registers(dst, isrc); 1763 bool isQ = vector_length_in_bytes == 16; 1764 1765 BLOCK_COMMENT("neon_reduce_add_integral {"); 1766 switch(bt) { 1767 case T_BYTE: 1768 addv(vtmp, isQ ? T16B : T8B, vsrc); 1769 smov(dst, vtmp, B, 0); 1770 addw(dst, dst, isrc, ext::sxtb); 1771 break; 1772 case T_SHORT: 1773 addv(vtmp, isQ ? T8H : T4H, vsrc); 1774 smov(dst, vtmp, H, 0); 1775 addw(dst, dst, isrc, ext::sxth); 1776 break; 1777 case T_INT: 1778 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1779 umov(dst, vtmp, S, 0); 1780 addw(dst, dst, isrc); 1781 break; 1782 case T_LONG: 1783 assert(isQ, "unsupported"); 1784 addpd(vtmp, vsrc); 1785 umov(dst, vtmp, D, 0); 1786 add(dst, dst, isrc); 1787 break; 1788 default: 1789 assert(false, "unsupported"); 1790 ShouldNotReachHere(); 1791 } 1792 BLOCK_COMMENT("} neon_reduce_add_integral"); 1793 } 1794 1795 // Vector reduction multiply for integral type with ASIMD instructions. 
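// ASIMD has no multiply-across-lanes reduction, so the vector is repeatedly folded in
// half: the upper half is multiplied into the lower half until only two lanes remain,
// and those two lanes are then combined with the scalar input "isrc" in general-purpose
// registers. Roughly, as a scalar model (illustrative sketch only, not part of the build):
//   // T lanes[N];  N = vector_length_in_bytes / sizeof(T)
//   for (int w = N; w > 2; w /= 2) {
//     for (int i = 0; i < w / 2; i++) {
//       lanes[i] = (T)(lanes[i] * lanes[i + w / 2]);
//     }
//   }
//   dst = (T)(lanes[0] * isrc);
//   dst = (T)(lanes[1] * dst);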
1796 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1797 // Clobbers: rscratch1 1798 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1799 Register isrc, FloatRegister vsrc, 1800 unsigned vector_length_in_bytes, 1801 FloatRegister vtmp1, FloatRegister vtmp2) { 1802 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1803 bool isQ = vector_length_in_bytes == 16; 1804 1805 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1806 switch(bt) { 1807 case T_BYTE: 1808 if (isQ) { 1809 // Multiply the lower half and higher half of vector iteratively. 1810 // vtmp1 = vsrc[8:15] 1811 ins(vtmp1, D, vsrc, 0, 1); 1812 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1813 mulv(vtmp1, T8B, vtmp1, vsrc); 1814 // vtmp2 = vtmp1[4:7] 1815 ins(vtmp2, S, vtmp1, 0, 1); 1816 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1817 mulv(vtmp1, T8B, vtmp2, vtmp1); 1818 } else { 1819 ins(vtmp1, S, vsrc, 0, 1); 1820 mulv(vtmp1, T8B, vtmp1, vsrc); 1821 } 1822 // vtmp2 = vtmp1[2:3] 1823 ins(vtmp2, H, vtmp1, 0, 1); 1824 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1825 mulv(vtmp2, T8B, vtmp2, vtmp1); 1826 // dst = vtmp2[0] * isrc * vtmp2[1] 1827 umov(rscratch1, vtmp2, B, 0); 1828 mulw(dst, rscratch1, isrc); 1829 sxtb(dst, dst); 1830 umov(rscratch1, vtmp2, B, 1); 1831 mulw(dst, rscratch1, dst); 1832 sxtb(dst, dst); 1833 break; 1834 case T_SHORT: 1835 if (isQ) { 1836 ins(vtmp2, D, vsrc, 0, 1); 1837 mulv(vtmp2, T4H, vtmp2, vsrc); 1838 ins(vtmp1, S, vtmp2, 0, 1); 1839 mulv(vtmp1, T4H, vtmp1, vtmp2); 1840 } else { 1841 ins(vtmp1, S, vsrc, 0, 1); 1842 mulv(vtmp1, T4H, vtmp1, vsrc); 1843 } 1844 umov(rscratch1, vtmp1, H, 0); 1845 mulw(dst, rscratch1, isrc); 1846 sxth(dst, dst); 1847 umov(rscratch1, vtmp1, H, 1); 1848 mulw(dst, rscratch1, dst); 1849 sxth(dst, dst); 1850 break; 1851 case T_INT: 1852 if (isQ) { 1853 ins(vtmp1, D, vsrc, 0, 1); 1854 mulv(vtmp1, T2S, vtmp1, vsrc); 1855 } else { 1856 vtmp1 = vsrc; 1857 } 1858 umov(rscratch1, vtmp1, S, 0); 1859 mul(dst, rscratch1, isrc); 1860 umov(rscratch1, vtmp1, S, 1); 1861 mul(dst, rscratch1, dst); 1862 break; 1863 case T_LONG: 1864 umov(rscratch1, vsrc, D, 0); 1865 mul(dst, isrc, rscratch1); 1866 umov(rscratch1, vsrc, D, 1); 1867 mul(dst, dst, rscratch1); 1868 break; 1869 default: 1870 assert(false, "unsupported"); 1871 ShouldNotReachHere(); 1872 } 1873 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1874 } 1875 1876 // Vector reduction multiply for floating-point type with ASIMD instructions. 
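// Note: the multiplications are applied strictly in lane order, e.g. for 4 float lanes
//   dst = (((fsrc * vsrc[0]) * vsrc[1]) * vsrc[2]) * vsrc[3]
// rather than pairwise; floating-point multiplication is not associative, so the order
// is part of the result.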
1877 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1878 FloatRegister fsrc, FloatRegister vsrc, 1879 unsigned vector_length_in_bytes, 1880 FloatRegister vtmp) { 1881 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1882 bool isQ = vector_length_in_bytes == 16; 1883 1884 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1885 switch(bt) { 1886 case T_FLOAT: 1887 fmuls(dst, fsrc, vsrc); 1888 ins(vtmp, S, vsrc, 0, 1); 1889 fmuls(dst, dst, vtmp); 1890 if (isQ) { 1891 ins(vtmp, S, vsrc, 0, 2); 1892 fmuls(dst, dst, vtmp); 1893 ins(vtmp, S, vsrc, 0, 3); 1894 fmuls(dst, dst, vtmp); 1895 } 1896 break; 1897 case T_DOUBLE: 1898 assert(isQ, "unsupported"); 1899 fmuld(dst, fsrc, vsrc); 1900 ins(vtmp, D, vsrc, 0, 1); 1901 fmuld(dst, dst, vtmp); 1902 break; 1903 default: 1904 assert(false, "unsupported"); 1905 ShouldNotReachHere(); 1906 } 1907 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1908 } 1909 1910 // Helper to select logical instruction 1911 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1912 Register Rn, Register Rm, 1913 enum shift_kind kind, unsigned shift) { 1914 switch(opc) { 1915 case Op_AndReductionV: 1916 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1917 break; 1918 case Op_OrReductionV: 1919 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1920 break; 1921 case Op_XorReductionV: 1922 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1923 break; 1924 default: 1925 assert(false, "unsupported"); 1926 ShouldNotReachHere(); 1927 } 1928 } 1929 1930 // Vector reduction logical operations And, Or, Xor 1931 // Clobbers: rscratch1 1932 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1933 Register isrc, FloatRegister vsrc, 1934 unsigned vector_length_in_bytes) { 1935 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1936 "unsupported"); 1937 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1938 assert_different_registers(dst, isrc); 1939 bool isQ = vector_length_in_bytes == 16; 1940 1941 BLOCK_COMMENT("neon_reduce_logical {"); 1942 umov(rscratch1, vsrc, isQ ? D : S, 0); 1943 umov(dst, vsrc, isQ ? 
D : S, 1); 1944 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 1945 switch(bt) { 1946 case T_BYTE: 1947 if (isQ) { 1948 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1949 } 1950 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1951 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 1952 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1953 sxtb(dst, dst); 1954 break; 1955 case T_SHORT: 1956 if (isQ) { 1957 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1958 } 1959 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1960 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1961 sxth(dst, dst); 1962 break; 1963 case T_INT: 1964 if (isQ) { 1965 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1966 } 1967 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1968 break; 1969 case T_LONG: 1970 assert(isQ, "unsupported"); 1971 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 1972 break; 1973 default: 1974 assert(false, "unsupported"); 1975 ShouldNotReachHere(); 1976 } 1977 BLOCK_COMMENT("} neon_reduce_logical"); 1978 } 1979 1980 // Vector reduction min/max for integral type with ASIMD instructions. 1981 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 1982 // Clobbers: rscratch1, rflags 1983 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 1984 Register isrc, FloatRegister vsrc, 1985 unsigned vector_length_in_bytes, 1986 FloatRegister vtmp) { 1987 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 1988 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1989 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 1990 assert_different_registers(dst, isrc); 1991 bool isQ = vector_length_in_bytes == 16; 1992 bool is_min = opc == Op_MinReductionV; 1993 1994 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 1995 if (bt == T_LONG) { 1996 assert(vtmp == fnoreg, "should be"); 1997 assert(isQ, "should be"); 1998 umov(rscratch1, vsrc, D, 0); 1999 cmp(isrc, rscratch1); 2000 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2001 umov(rscratch1, vsrc, D, 1); 2002 cmp(dst, rscratch1); 2003 csel(dst, dst, rscratch1, is_min ? LT : GT); 2004 } else { 2005 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2006 if (size == T2S) { 2007 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2008 } else { 2009 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2010 } 2011 if (bt == T_INT) { 2012 umov(dst, vtmp, S, 0); 2013 } else { 2014 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2015 } 2016 cmpw(dst, isrc); 2017 cselw(dst, dst, isrc, is_min ? LT : GT); 2018 } 2019 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2020 } 2021 2022 // Vector reduction for integral type with SVE instruction. 2023 // Supported operations are Add, And, Or, Xor, Max, Min. 2024 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
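// The per-type SVE reduction instruction (sve_uaddv / sve_andv / sve_orv / sve_eorv /
// sve_smaxv / sve_sminv below) folds the active lanes of src2 into lane 0 of tmp; that
// scalar is then moved to dst and combined with src1 using the matching scalar operation.
// For sub-word element types the final value is sign-extended to int, e.g. (illustrative):
//   AndReductionV, T_BYTE:  dst = (int)(jbyte)(src1 & (src2[0] & src2[1] & ... active lanes))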
2025 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2026 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2027 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2028 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2029 assert_different_registers(src1, dst); 2030 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2031 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2032 switch (opc) { 2033 case Op_AddReductionVI: { 2034 sve_uaddv(tmp, size, pg, src2); 2035 if (bt == T_BYTE) { 2036 smov(dst, tmp, size, 0); 2037 addw(dst, src1, dst, ext::sxtb); 2038 } else if (bt == T_SHORT) { 2039 smov(dst, tmp, size, 0); 2040 addw(dst, src1, dst, ext::sxth); 2041 } else { 2042 umov(dst, tmp, size, 0); 2043 addw(dst, dst, src1); 2044 } 2045 break; 2046 } 2047 case Op_AddReductionVL: { 2048 sve_uaddv(tmp, size, pg, src2); 2049 umov(dst, tmp, size, 0); 2050 add(dst, dst, src1); 2051 break; 2052 } 2053 case Op_AndReductionV: { 2054 sve_andv(tmp, size, pg, src2); 2055 if (bt == T_INT || bt == T_LONG) { 2056 umov(dst, tmp, size, 0); 2057 } else { 2058 smov(dst, tmp, size, 0); 2059 } 2060 if (bt == T_LONG) { 2061 andr(dst, dst, src1); 2062 } else { 2063 andw(dst, dst, src1); 2064 } 2065 break; 2066 } 2067 case Op_OrReductionV: { 2068 sve_orv(tmp, size, pg, src2); 2069 if (bt == T_INT || bt == T_LONG) { 2070 umov(dst, tmp, size, 0); 2071 } else { 2072 smov(dst, tmp, size, 0); 2073 } 2074 if (bt == T_LONG) { 2075 orr(dst, dst, src1); 2076 } else { 2077 orrw(dst, dst, src1); 2078 } 2079 break; 2080 } 2081 case Op_XorReductionV: { 2082 sve_eorv(tmp, size, pg, src2); 2083 if (bt == T_INT || bt == T_LONG) { 2084 umov(dst, tmp, size, 0); 2085 } else { 2086 smov(dst, tmp, size, 0); 2087 } 2088 if (bt == T_LONG) { 2089 eor(dst, dst, src1); 2090 } else { 2091 eorw(dst, dst, src1); 2092 } 2093 break; 2094 } 2095 case Op_MaxReductionV: { 2096 sve_smaxv(tmp, size, pg, src2); 2097 if (bt == T_INT || bt == T_LONG) { 2098 umov(dst, tmp, size, 0); 2099 } else { 2100 smov(dst, tmp, size, 0); 2101 } 2102 if (bt == T_LONG) { 2103 cmp(dst, src1); 2104 csel(dst, dst, src1, Assembler::GT); 2105 } else { 2106 cmpw(dst, src1); 2107 cselw(dst, dst, src1, Assembler::GT); 2108 } 2109 break; 2110 } 2111 case Op_MinReductionV: { 2112 sve_sminv(tmp, size, pg, src2); 2113 if (bt == T_INT || bt == T_LONG) { 2114 umov(dst, tmp, size, 0); 2115 } else { 2116 smov(dst, tmp, size, 0); 2117 } 2118 if (bt == T_LONG) { 2119 cmp(dst, src1); 2120 csel(dst, dst, src1, Assembler::LT); 2121 } else { 2122 cmpw(dst, src1); 2123 cselw(dst, dst, src1, Assembler::LT); 2124 } 2125 break; 2126 } 2127 default: 2128 assert(false, "unsupported"); 2129 ShouldNotReachHere(); 2130 } 2131 2132 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2133 if (bt == T_BYTE) { 2134 sxtb(dst, dst); 2135 } else if (bt == T_SHORT) { 2136 sxth(dst, dst); 2137 } 2138 } 2139 } 2140 2141 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2142 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2143 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
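// For example (illustrative), with bt == T_BYTE on a 512-bit SVE machine (max 64 byte lanes):
//   lane_cnt == 8   is encoded with the fixed "ptrue" pattern VL8,
//   lane_cnt == 32  with the fixed pattern VL32,
//   lane_cnt == 63  matches the MUL3 pattern (64 - 64 % 3),
//   lane_cnt == 10  falls through to "whileltw" with rscratch1 == 10.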
2144 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2145   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2146   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2147 
2148   // Set all elements to false if the input "lane_cnt" is zero.
2149   if (lane_cnt == 0) {
2150     sve_pfalse(dst);
2151     return;
2152   }
2153 
2154   SIMD_RegVariant size = elemType_to_regVariant(bt);
2155   assert(size != Q, "invalid size");
2156 
2157   // Set all elements to true if "lane_cnt" is equal to the max lane count.
2158   if (lane_cnt == max_vector_length) {
2159     sve_ptrue(dst, size, /* ALL */ 0b11111);
2160     return;
2161   }
2162 
2163   // Fixed numbers for "ptrue".
2164   switch(lane_cnt) {
2165     case 1: /* VL1 */
2166     case 2: /* VL2 */
2167     case 3: /* VL3 */
2168     case 4: /* VL4 */
2169     case 5: /* VL5 */
2170     case 6: /* VL6 */
2171     case 7: /* VL7 */
2172     case 8: /* VL8 */
2173       sve_ptrue(dst, size, lane_cnt);
2174       return;
2175     case 16:
2176       sve_ptrue(dst, size, /* VL16 */ 0b01001);
2177       return;
2178     case 32:
2179       sve_ptrue(dst, size, /* VL32 */ 0b01010);
2180       return;
2181     case 64:
2182       sve_ptrue(dst, size, /* VL64 */ 0b01011);
2183       return;
2184     case 128:
2185       sve_ptrue(dst, size, /* VL128 */ 0b01100);
2186       return;
2187     case 256:
2188       sve_ptrue(dst, size, /* VL256 */ 0b01101);
2189       return;
2190     default:
2191       break;
2192   }
2193 
2194   // Special patterns for "ptrue".
2195   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2196     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2197   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2198     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2199   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2200     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2201   } else {
2202     // Encode to "whileltw" for the remaining cases.
2203     mov(rscratch1, lane_cnt);
2204     sve_whileltw(dst, size, zr, rscratch1);
2205   }
2206 }
2207 
2208 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2209 // Any remaining elements of dst will be filled with zero.
2210 // Clobbers: rscratch1
2211 // Preserves: src, mask
2212 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2213                                            FloatRegister vtmp1, FloatRegister vtmp2,
2214                                            PRegister pgtmp) {
2215   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2216   assert_different_registers(dst, src, vtmp1, vtmp2);
2217   assert_different_registers(mask, pgtmp);
2218 
2219   // Example input:   src  = 8888 7777 6666 5555 4444 3333 2222 1111
2220   //                  mask = 0001 0000 0000 0001 0001 0000 0001 0001
2221   // Expected result: dst  = 0000 0000 0000 8888 5555 4444 2222 1111
2222   sve_dup(vtmp2, H, 0);
2223 
2224   // Extend lowest half to type INT.
2225   // dst = 00004444 00003333 00002222 00001111
2226   sve_uunpklo(dst, S, src);
2227   // pgtmp = 00000001 00000000 00000001 00000001
2228   sve_punpklo(pgtmp, mask);
2229   // Pack the active elements in size of type INT to the right,
2230   // and fill the remaining elements with zero.
2231   // dst = 00000000 00004444 00002222 00001111
2232   sve_compact(dst, S, dst, pgtmp);
2233   // Narrow the result back to type SHORT.
2234   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2235   sve_uzp1(dst, H, dst, vtmp2);
2236   // Count the active elements of the lowest half.
2237   // rscratch1 = 3
2238   sve_cntp(rscratch1, S, ptrue, pgtmp);
2239 
2240   // Repeat for the highest half.
2241   // pgtmp = 00000001 00000000 00000000 00000001
2242   sve_punpkhi(pgtmp, mask);
2243   // vtmp1 = 00008888 00007777 00006666 00005555
2244   sve_uunpkhi(vtmp1, S, src);
2245   // vtmp1 = 00000000 00000000 00008888 00005555
2246   sve_compact(vtmp1, S, vtmp1, pgtmp);
2247   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2248   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2249 
2250   // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2251   // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2252   // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
2253   // TRUE_CNT is the number of active elements in the compressed low.
2254   neg(rscratch1, rscratch1);
2255   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2256   sve_index(vtmp2, H, rscratch1, 1);
2257   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2258   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2259 
2260   // Combine the shifted compressed high with the compressed low.
2261   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2262   sve_orr(dst, dst, vtmp1);
2263 }
2264 
2265 // Clobbers: rscratch1, rscratch2
2266 // Preserves: src, mask
2267 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2268                                           FloatRegister vtmp1, FloatRegister vtmp2,
2269                                           FloatRegister vtmp3, FloatRegister vtmp4,
2270                                           PRegister ptmp, PRegister pgtmp) {
2271   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2272   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2273   assert_different_registers(mask, ptmp, pgtmp);
2274   // Example input:   src  = 88 77 66 55 44 33 22 11
2275   //                  mask = 01 00 00 01 01 00 01 01
2276   // Expected result: dst  = 00 00 00 88 55 44 22 11
2277 
2278   sve_dup(vtmp4, B, 0);
2279   // Extend lowest half to type SHORT.
2280   // vtmp1 = 0044 0033 0022 0011
2281   sve_uunpklo(vtmp1, H, src);
2282   // ptmp = 0001 0000 0001 0001
2283   sve_punpklo(ptmp, mask);
2284   // Count the active elements of the lowest half.
2285   // rscratch2 = 3
2286   sve_cntp(rscratch2, H, ptrue, ptmp);
2287   // Pack the active elements in size of type SHORT to the right,
2288   // and fill the remaining elements with zero.
2289   // dst = 0000 0044 0022 0011
2290   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2291   // Narrow the result back to type BYTE.
2292   // dst = 00 00 00 00 00 44 22 11
2293   sve_uzp1(dst, B, dst, vtmp4);
2294 
2295   // Repeat for the highest half.
2296   // ptmp = 0001 0000 0000 0001
2297   sve_punpkhi(ptmp, mask);
2298   // vtmp2 = 0088 0077 0066 0055
2299   sve_uunpkhi(vtmp2, H, src);
2300   // vtmp1 = 0000 0000 0088 0055
2301   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2302 
2303   sve_dup(vtmp4, B, 0);
2304   // vtmp1 = 00 00 00 00 00 00 88 55
2305   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2306 
2307   // Compressed low:  dst   = 00 00 00 00 00 44 22 11
2308   // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
2309   // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
2310   // TRUE_CNT is the number of active elements in the compressed low.
2311   neg(rscratch2, rscratch2);
2312   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2313   sve_index(vtmp2, B, rscratch2, 1);
2314   // vtmp1 = 00 00 00 88 55 00 00 00
2315   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2316   // Combine the shifted compressed high with the compressed low.
2317 // dst = 00 00 00 88 55 44 22 11 2318 sve_orr(dst, dst, vtmp1); 2319 } 2320 2321 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2322 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2323 SIMD_Arrangement size = isQ ? T16B : T8B; 2324 if (bt == T_BYTE) { 2325 rbit(dst, size, src); 2326 } else { 2327 neon_reverse_bytes(dst, src, bt, isQ); 2328 rbit(dst, size, dst); 2329 } 2330 } 2331 2332 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2333 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2334 SIMD_Arrangement size = isQ ? T16B : T8B; 2335 switch (bt) { 2336 case T_BYTE: 2337 if (dst != src) { 2338 orr(dst, size, src, src); 2339 } 2340 break; 2341 case T_SHORT: 2342 rev16(dst, size, src); 2343 break; 2344 case T_INT: 2345 rev32(dst, size, src); 2346 break; 2347 case T_LONG: 2348 rev64(dst, size, src); 2349 break; 2350 default: 2351 assert(false, "unsupported"); 2352 ShouldNotReachHere(); 2353 } 2354 } 2355 2356 // Extract a scalar element from an sve vector at position 'idx'. 2357 // The input elements in src are expected to be of integral type. 2358 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2359 int idx, FloatRegister vtmp) { 2360 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2361 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2362 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2363 if (bt == T_INT || bt == T_LONG) { 2364 umov(dst, src, size, idx); 2365 } else { 2366 smov(dst, src, size, idx); 2367 } 2368 } else { 2369 sve_orr(vtmp, src, src); 2370 sve_ext(vtmp, vtmp, idx << size); 2371 if (bt == T_INT || bt == T_LONG) { 2372 umov(dst, vtmp, size, 0); 2373 } else { 2374 smov(dst, vtmp, size, 0); 2375 } 2376 } 2377 } 2378 2379 // java.lang.Math::round intrinsics 2380 2381 // Clobbers: rscratch1, rflags 2382 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2383 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2384 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2385 switch (T) { 2386 case T2S: 2387 case T4S: 2388 fmovs(tmp1, T, 0.5f); 2389 mov(rscratch1, jint_cast(0x1.0p23f)); 2390 break; 2391 case T2D: 2392 fmovd(tmp1, T, 0.5); 2393 mov(rscratch1, julong_cast(0x1.0p52)); 2394 break; 2395 default: 2396 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2397 } 2398 fadd(tmp1, T, tmp1, src); 2399 fcvtms(tmp1, T, tmp1); 2400 // tmp1 = floor(src + 0.5, ties to even) 2401 2402 fcvtas(dst, T, src); 2403 // dst = round(src), ties to away 2404 2405 fneg(tmp3, T, src); 2406 dup(tmp2, T, rscratch1); 2407 cm(HS, tmp3, T, tmp3, tmp2); 2408 // tmp3 is now a set of flags 2409 2410 bif(dst, T16B, tmp1, tmp3); 2411 // result in dst 2412 } 2413 2414 // Clobbers: rscratch1, rflags 2415 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2416 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2417 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2418 assert_different_registers(tmp1, tmp2, src, dst); 2419 2420 switch (T) { 2421 case S: 2422 mov(rscratch1, jint_cast(0x1.0p23f)); 2423 break; 2424 case D: 2425 mov(rscratch1, julong_cast(0x1.0p52)); 2426 break; 2427 
default:
2428       assert(T == S || T == D, "invalid register variant");
2429   }
2430 
2431   sve_frinta(dst, T, ptrue, src);
2432   // dst = round(src), ties to away
2433 
2434   Label none;
2435 
2436   sve_fneg(tmp1, T, ptrue, src);
2437   sve_dup(tmp2, T, rscratch1);
2438   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2439   br(EQ, none);
2440   {
2441     sve_cpy(tmp1, T, pgtmp, 0.5);
2442     sve_fadd(tmp1, T, pgtmp, src);
2443     sve_frintm(dst, T, pgtmp, tmp1);
2444     // dst = floor(src + 0.5, ties to even)
2445   }
2446   bind(none);
2447 
2448   sve_fcvtzs(dst, T, ptrue, dst, T);
2449   // result in dst
2450 }
2451 
2452 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2453                                            FloatRegister one, SIMD_Arrangement T) {
2454   assert_different_registers(dst, src, zero, one);
2455   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2456 
2457   facgt(dst, T, src, zero);
2458   ushr(dst, T, dst, 1);                      // dst = 0 for +/-0.0 and NaN, 0x7FF..F otherwise
2459   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2460 }
2461 
2462 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2463                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2464   assert_different_registers(dst, src, zero, one, vtmp);
2465   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2466 
2467   sve_orr(vtmp, src, src);
2468   sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp = 0 for +/-0.0 and NaN, 0x1 otherwise
2469   switch (T) {
2470   case S:
2471     sve_and(vtmp, T, min_jint);       // Extract the sign bit of the float value in every lane of src
2472     sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2473                                       // on the sign of the float value
2474     break;
2475   case D:
2476     sve_and(vtmp, T, min_jlong);
2477     sve_orr(vtmp, T, jlong_cast(1.0));
2478     break;
2479   default:
2480     assert(false, "unsupported");
2481     ShouldNotReachHere();
2482   }
2483   sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2484                                      // Result in dst
2485 }
2486 
2487 bool C2_MacroAssembler::in_scratch_emit_size() {
2488   if (ciEnv::current()->task() != nullptr) {
2489     PhaseOutput* phase_output = Compile::current()->output();
2490     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2491       return true;
2492     }
2493   }
2494   return MacroAssembler::in_scratch_emit_size();
2495 }
2496 
2497 void C2_MacroAssembler::load_nklass_compact(Register dst, Register obj, Register index, int scale, int disp) {
2498   C2LoadNKlassStub* stub = new (Compile::current()->comp_arena()) C2LoadNKlassStub(dst);
2499   Compile::current()->output()->add_stub(stub);
2500 
2501   // Note: Don't clobber obj anywhere in this method!
2502 
2503   // The incoming address points to obj-start + klass_offset_in_bytes. We need to extract
2504   // obj-start, so that we can load from the object's mark-word instead. Usually the address
2505   // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
2506   // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
2507   // then passes that register as obj and 0 in disp. The following code extracts the base
2508   // and offset to load the mark-word.
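  // For example: if obj holds obj-start and disp == klass_offset_in_bytes, the computed
  // offset reduces to mark_offset_in_bytes(), so the load below hits the mark-word; if obj
  // already holds obj-start + klass_offset_in_bytes and disp == 0, the offset becomes
  // mark_offset_in_bytes() - klass_offset_in_bytes(), which relative to obj again resolves
  // to obj-start + mark_offset_in_bytes().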
2509 int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes(); 2510 if (index == noreg) { 2511 ldr(dst, Address(obj, offset)); 2512 } else { 2513 lea(dst, Address(obj, index, Address::lsl(scale))); 2514 ldr(dst, Address(dst, offset)); 2515 } 2516 // NOTE: We can't use tbnz here, because the target is sometimes too far away 2517 // and cannot be encoded. 2518 tst(dst, markWord::monitor_value); 2519 br(Assembler::NE, stub->entry()); 2520 bind(stub->continuation()); 2521 lsr(dst, dst, markWord::klass_shift); 2522 }