1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// bind() the label and, in non-PRODUCT builds, annotate the disassembly with it.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Member-function pointer type used to select between byte (ldrb) and
// halfword (ldrh) character loads for Latin1 vs UTF-16 strings.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Emit the nmethod entry barrier for C2-compiled code. The cold part of the
// barrier is moved out-of-line into a C2EntryBarrierStub; while C2 is only
// measuring code size (in_scratch_emit_size), dummy labels are used instead
// of allocating a real stub.
void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

// C2 fast path for monitorenter.
//   objectReg - object to lock
//   boxReg    - on-stack BasicLock box
//   tmpReg, tmp2Reg, tmp3Reg - temporaries (clobbered)
// The outcome is communicated through the condition flags:
//   EQ -> lock acquired on the fast path
//   NE -> caller must take the slow path
// On success (and not for LM_LIGHTWEIGHT's no_count exit) the per-thread
// held_monitor_count is incremented.
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must not be synchronized on: branch to the slow
    // path (flags == NE) if the class carries the flag.
    load_klass(tmp, oop);
    ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
    tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, will have now locked it will continue at label cont

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If condition is true we are cont and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, no_count);
    b(count);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  if (LockingMode != LM_LIGHTWEIGHT) {
    // Store a non-null value into the box to avoid looking like a re-entrant
    // lock. The fast-path monitor unlock code checks for
    // markWord::monitor_value so use markWord::unused_mark which has the
    // relevant bit set, and also matches ObjectSynchronizer::enter.
    mov(tmp, (address)markWord::unused_mark().value());
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  }
  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

// C2 fast path for monitorexit.
//   objectReg - object to unlock
//   boxReg    - on-stack BasicLock box used at lock time
//   tmpReg, tmp2Reg - temporaries (clobbered)
// Flag protocol mirrors fast_lock:
//   EQ -> unlocked on the fast path
//   NE -> caller must take the slow path
// On success the per-thread held_monitor_count is decremented.
void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a light weight lock, this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    lightweight_unlock(oop, tmp, box, disp_hdr, no_count);
    b(count);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  // Strip the monitor tag bits to get the ObjectMonitor* in tmp.
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is anonymous, we need to fix it -- in an outline stub.
    Register tmp2 = disp_hdr;
    ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
    // We cannot use tbnz here, the target might be too far away and cannot
    // be encoded.
    tst(tmp2, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER);
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
    Compile::current()->output()->add_stub(stub);
    br(Assembler::NE, stub->entry());
    bind(stub->continuation());
  }

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock: just decrement the recursion count.
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result (always EQ -> success)
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont); // Waiters present -> slow path (flags == NE here)
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
// icnt1 is the pattern length when known at compile time, or -1 when only
// known at runtime (in cnt1). ae encodes the Latin1/UTF-16 combination of the
// two strings (StrIntrinsicNode::LL/LU/UL/UU).
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    // Runtime-length pattern: pick the algorithm based on sizes.
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = y[i+j];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c< 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // Build the bad-character table on the stack: every entry starts out as
    // cnt1 (v0 was pre-filled with cnt1 above), stored 32 bytes at a time.
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      // Fill in skip distances for the first cnt1-1 pattern characters.
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        // UTF-16 pattern chars >= 256 have no table slot; skip them.
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      // Outer search loop: compare the pattern tail against the window.
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      // Inner per-character verification loop, walking backwards.
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      // Advance the window by the skip distance and loop while in range.
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE); // pop the bad-character table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // Match: result = (current window - original str2), in characters.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE); // pop the bad-character table
      b(DONE);

    BIND(LINEARSTUB);
      cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
      br(LT, LINEAR_MEDIUM);
      mov(result, zr);
      RuntimeAddress stub = nullptr;
      if (isL) {
        stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
        assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
      } else if (str1_isL) {
        stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
        assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
      } else {
        stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
        assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
      }
      address call = trampoline_call(stub);
      if (call == nullptr) {
        // Bail out of compilation; unbind labels so debug builds don't assert.
        DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
        ciEnv::current()->record_failure("CodeCache is full");
        return;
      }
      b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
    BIND(LINEAR_MEDIUM);
      // Scan for the first pattern char, then verify the rest on a hit.
      // cnt1_neg/cnt2_neg count up from negative offsets to zero.
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

    BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

    BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      // Compile-time pattern length 4: compare 4 chars per load.
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      // Pattern length 2: compare 2 chars per load.
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      // Pattern length 3: 2-char load for the prefix plus a 1-char check.
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      // Pattern length 1: single-char search, SWAR over 8 bytes when the
      // source is long enough.
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

    BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      // Replicate the searched char into every lane of ch1.
      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
    BIND(CH1_LOOP);
      // XOR makes a matching lane zero; the sub/orr/bics sequence detects a
      // zero lane (classic SWAR zero-byte test) and sets NE if one exists.
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      // Handle the (possibly overlapping) final 8-byte chunk.
      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

    BIND(HAS_ZERO);
      // Locate the first zero lane: byte-reverse, count leading zeros,
      // convert bit index to byte offset.
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

    BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
    BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the (negative) byte offset back to a character index.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

// Re-declared here; identical to the typedef above, which is harmless.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of the 16-bit char 'ch' in the UTF-16 string
// (str1, cnt1 chars). result = char index of the match, or -1 if not found.
// Uses an 8-byte SWAR scan (4 chars per load) when cnt1 >= 4.
// Clobbers rscratch1, rscratch2, str1, cnt1, ch, tmp1-tmp3, rflags.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string never matches.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four 16-bit lanes.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    // SWAR zero-lane detection on (str1 chunk XOR ch): NE iff a lane matched.
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Handle the (possibly overlapping) final chunk.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // First matching lane -> byte offset within the chunk.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 4 chars: simple per-character loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Byte offset -> char index.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// SVE variant of the single-char search. isL selects Latin1 (byte) vs
// UTF-16 (halfword) elements. result = element index or -1.
// Clobbers rscratch1, rscratch2, ztmp1, ztmp2, tmp_pg, tmp_pdn, rflags.
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

// Latin1 (byte) counterpart of string_indexof_char: find the first byte of
// str1 (cnt1 bytes) equal to ch. result = index or -1.
// Uses an 8-byte SWAR scan (8 chars per load) when cnt1 >= 8.
// Clobbers rscratch1, rscratch2, str1, cnt1, ch, tmp1-tmp3, rflags.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string never matches.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate ch into all eight byte lanes.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    // SWAR zero-byte detection on (str1 chunk XOR ch): NE iff a byte matched.
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Handle the (possibly overlapping) final chunk.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // First matching byte -> offset within the chunk.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 8 bytes: simple per-byte loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
// Compare the strings [str1, str1+cnt1) and [str2, str2+cnt2) and set
// 'result' to a value <0, ==0 or >0 (the difference of the first differing
// characters, or the length difference if one is a prefix of the other).
// 'ae' selects the argument encodings (LL/UU/LU/UL, Latin-1 vs UTF-16);
// counts are passed in BYTES (see below). Long inputs are handed off to the
// compare_long_string_* stub routines.
// Clobbers: rscratch1, rscratch2, rflags; all non-result inputs are destroyed.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Per-string character loads: byte loads for Latin-1, halfword for UTF-16.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical addresses compare equal; skip the whole comparison.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their ends and iterate with negative offsets.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin-1 side is widened to UTF-16 with zip1 against a zero vector.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    // Round the differing-bit position down to a character boundary
    // (8 bits for Latin-1, 16 for UTF-16), then extract both characters.
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
  case StrIntrinsicNode::LL:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
    break;
  case StrIntrinsicNode::UU:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
    break;
  case StrIntrinsicNode::LU:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
    break;
  case StrIntrinsicNode::UL:
    stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
    break;
  default:
    ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    // Trampoline allocation failed; abandon this compilation.
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

// Element-wise NEON compare of src1 against src2 into dst. NEON only
// provides GT/GE/HI/HS/EQ compares, so LT/LE/LO/LS are emitted by swapping
// the operands, and NE is emitted as EQ followed by a bitwise NOT.
void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

// Element-wise NEON compare of src against zero into dst, using the
// compare-against-zero instruction forms; NE is emitted as EQ + NOT.
void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected: dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    // Narrow one byte-per-lane down to one byte per mask element.
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      // Move the upper doubleword's low byte next to the lower one.
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // No FEAT_BITPERM: fall back to the scalar SWAR compression.
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
1287 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt, 1288 FloatRegister vtmp1, FloatRegister vtmp2) { 1289 assert(UseSVE == 2 && VM_Version::supports_svebitperm() && 1290 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported"); 1291 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1292 // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16 1293 // Expected: dst = 0b01101001 10001101 1294 1295 // Put long value from general purpose register into the first lane of vector. 1296 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1297 sve_dup(vtmp1, B, 0); 1298 mov(vtmp1, D, 0, src); 1299 1300 // As sve_cmp generates mask value with the minimum unit in byte, we should 1301 // transform the value in the first lane which is mask in bit now to the 1302 // mask in byte, which can be done by SVE2's BDEP instruction. 1303 1304 // The first source input of BDEP instruction. Deposite each byte in every 8 bytes. 1305 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1306 if (lane_cnt <= 8) { 1307 // Nothing. As only one byte exsits. 1308 } else if (lane_cnt <= 16) { 1309 ins(vtmp1, B, vtmp1, 8, 1); 1310 mov(vtmp1, B, 1, zr); 1311 } else { 1312 sve_vector_extend(vtmp1, D, vtmp1, B); 1313 } 1314 1315 // The second source input of BDEP instruction, initialized with 0x01 for each byte. 1316 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1317 sve_dup(vtmp2, B, 1); 1318 1319 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1320 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1321 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1322 // --------------------------------------- 1323 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1324 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1325 1326 if (bt != T_BYTE) { 1327 sve_vector_extend(vtmp1, size, vtmp1, B); 1328 } 1329 // Generate mask according to the given vector, in which the elements have been 1330 // extended to expected type. 
1331 // dst = 0b01101001 10001101 1332 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1333 } 1334 1335 // Clobbers: rflags 1336 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1337 FloatRegister zn, FloatRegister zm, Condition cond) { 1338 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1339 FloatRegister z1 = zn, z2 = zm; 1340 switch (cond) { 1341 case LE: z1 = zm; z2 = zn; cond = GE; break; 1342 case LT: z1 = zm; z2 = zn; cond = GT; break; 1343 case LO: z1 = zm; z2 = zn; cond = HI; break; 1344 case LS: z1 = zm; z2 = zn; cond = HS; break; 1345 default: 1346 break; 1347 } 1348 1349 SIMD_RegVariant size = elemType_to_regVariant(bt); 1350 if (is_floating_point_type(bt)) { 1351 sve_fcm(cond, pd, size, pg, z1, z2); 1352 } else { 1353 assert(is_integral_type(bt), "unsupported element type"); 1354 sve_cmp(cond, pd, size, pg, z1, z2); 1355 } 1356 } 1357 1358 // Get index of the last mask lane that is set 1359 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1360 SIMD_RegVariant size = elemType_to_regVariant(bt); 1361 sve_rev(ptmp, size, src); 1362 sve_brkb(ptmp, ptrue, ptmp, false); 1363 sve_cntp(dst, size, ptrue, ptmp); 1364 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1365 subw(dst, rscratch1, dst); 1366 } 1367 1368 // Extend integer vector src to dst with the same lane count 1369 // but larger element size, e.g. 
4B -> 4I 1370 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1371 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1372 if (src_bt == T_BYTE) { 1373 if (dst_bt == T_SHORT) { 1374 // 4B/8B to 4S/8S 1375 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1376 } else { 1377 // 4B to 4I 1378 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1379 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1380 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1381 } 1382 } else if (src_bt == T_SHORT) { 1383 // 4S to 4I 1384 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1385 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1386 } else if (src_bt == T_INT) { 1387 // 2I to 2L 1388 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1389 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1390 } else { 1391 ShouldNotReachHere(); 1392 } 1393 } 1394 1395 // Narrow integer vector src down to dst with the same lane count 1396 // but smaller element size, e.g. 
4I -> 4B 1397 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1398 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1399 if (src_bt == T_SHORT) { 1400 // 4S/8S to 4B/8B 1401 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1402 assert(dst_bt == T_BYTE, "unsupported"); 1403 xtn(dst, T8B, src, T8H); 1404 } else if (src_bt == T_INT) { 1405 // 4I to 4B/4S 1406 assert(src_vlen_in_bytes == 16, "unsupported"); 1407 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1408 xtn(dst, T4H, src, T4S); 1409 if (dst_bt == T_BYTE) { 1410 xtn(dst, T8B, dst, T8H); 1411 } 1412 } else if (src_bt == T_LONG) { 1413 // 2L to 2I 1414 assert(src_vlen_in_bytes == 16, "unsupported"); 1415 assert(dst_bt == T_INT, "unsupported"); 1416 xtn(dst, T2S, src, T2D); 1417 } else { 1418 ShouldNotReachHere(); 1419 } 1420 } 1421 1422 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1423 FloatRegister src, SIMD_RegVariant src_size, 1424 bool is_unsigned) { 1425 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1426 1427 if (src_size == B) { 1428 switch (dst_size) { 1429 case H: 1430 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1431 break; 1432 case S: 1433 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1434 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1435 break; 1436 case D: 1437 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1438 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1439 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1440 break; 1441 default: 1442 ShouldNotReachHere(); 1443 } 1444 } else if (src_size == H) { 1445 if (dst_size == S) { 1446 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1447 } else { // D 1448 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1449 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1450 } 1451 } else if 
(src_size == S) { 1452 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1453 } 1454 } 1455 1456 // Vector narrow from src to dst with specified element sizes. 1457 // High part of dst vector will be filled with zero. 1458 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1459 FloatRegister src, SIMD_RegVariant src_size, 1460 FloatRegister tmp) { 1461 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1462 assert_different_registers(src, tmp); 1463 sve_dup(tmp, src_size, 0); 1464 if (src_size == D) { 1465 switch (dst_size) { 1466 case S: 1467 sve_uzp1(dst, S, src, tmp); 1468 break; 1469 case H: 1470 assert_different_registers(dst, tmp); 1471 sve_uzp1(dst, S, src, tmp); 1472 sve_uzp1(dst, H, dst, tmp); 1473 break; 1474 case B: 1475 assert_different_registers(dst, tmp); 1476 sve_uzp1(dst, S, src, tmp); 1477 sve_uzp1(dst, H, dst, tmp); 1478 sve_uzp1(dst, B, dst, tmp); 1479 break; 1480 default: 1481 ShouldNotReachHere(); 1482 } 1483 } else if (src_size == S) { 1484 if (dst_size == H) { 1485 sve_uzp1(dst, H, src, tmp); 1486 } else { // B 1487 assert_different_registers(dst, tmp); 1488 sve_uzp1(dst, H, src, tmp); 1489 sve_uzp1(dst, B, dst, tmp); 1490 } 1491 } else if (src_size == H) { 1492 sve_uzp1(dst, B, src, tmp); 1493 } 1494 } 1495 1496 // Extend src predicate to dst predicate with the same lane count but larger 1497 // element size, e.g. 
64Byte -> 512Long 1498 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1499 uint dst_element_length_in_bytes, 1500 uint src_element_length_in_bytes) { 1501 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1502 sve_punpklo(dst, src); 1503 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1504 sve_punpklo(dst, src); 1505 sve_punpklo(dst, dst); 1506 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1507 sve_punpklo(dst, src); 1508 sve_punpklo(dst, dst); 1509 sve_punpklo(dst, dst); 1510 } else { 1511 assert(false, "unsupported"); 1512 ShouldNotReachHere(); 1513 } 1514 } 1515 1516 // Narrow src predicate to dst predicate with the same lane count but 1517 // smaller element size, e.g. 512Long -> 64Byte 1518 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1519 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1520 // The insignificant bits in src predicate are expected to be zero. 1521 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1522 // passed as the second argument. 
An example narrowing operation with a given mask would be - 1523 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1524 // Mask (for 2 Longs) : TF 1525 // Predicate register for the above mask (16 bits) : 00000001 00000000 1526 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1527 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1528 assert_different_registers(src, ptmp); 1529 assert_different_registers(dst, ptmp); 1530 sve_pfalse(ptmp); 1531 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1532 sve_uzp1(dst, B, src, ptmp); 1533 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1534 sve_uzp1(dst, H, src, ptmp); 1535 sve_uzp1(dst, B, dst, ptmp); 1536 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1537 sve_uzp1(dst, S, src, ptmp); 1538 sve_uzp1(dst, H, dst, ptmp); 1539 sve_uzp1(dst, B, dst, ptmp); 1540 } else { 1541 assert(false, "unsupported"); 1542 ShouldNotReachHere(); 1543 } 1544 } 1545 1546 // Vector reduction add for integral type with ASIMD instructions. 1547 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1548 Register isrc, FloatRegister vsrc, 1549 unsigned vector_length_in_bytes, 1550 FloatRegister vtmp) { 1551 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1552 assert_different_registers(dst, isrc); 1553 bool isQ = vector_length_in_bytes == 16; 1554 1555 BLOCK_COMMENT("neon_reduce_add_integral {"); 1556 switch(bt) { 1557 case T_BYTE: 1558 addv(vtmp, isQ ? T16B : T8B, vsrc); 1559 smov(dst, vtmp, B, 0); 1560 addw(dst, dst, isrc, ext::sxtb); 1561 break; 1562 case T_SHORT: 1563 addv(vtmp, isQ ? T8H : T4H, vsrc); 1564 smov(dst, vtmp, H, 0); 1565 addw(dst, dst, isrc, ext::sxth); 1566 break; 1567 case T_INT: 1568 isQ ? 
addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1569 umov(dst, vtmp, S, 0); 1570 addw(dst, dst, isrc); 1571 break; 1572 case T_LONG: 1573 assert(isQ, "unsupported"); 1574 addpd(vtmp, vsrc); 1575 umov(dst, vtmp, D, 0); 1576 add(dst, dst, isrc); 1577 break; 1578 default: 1579 assert(false, "unsupported"); 1580 ShouldNotReachHere(); 1581 } 1582 BLOCK_COMMENT("} neon_reduce_add_integral"); 1583 } 1584 1585 // Vector reduction multiply for integral type with ASIMD instructions. 1586 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1587 // Clobbers: rscratch1 1588 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1589 Register isrc, FloatRegister vsrc, 1590 unsigned vector_length_in_bytes, 1591 FloatRegister vtmp1, FloatRegister vtmp2) { 1592 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1593 bool isQ = vector_length_in_bytes == 16; 1594 1595 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1596 switch(bt) { 1597 case T_BYTE: 1598 if (isQ) { 1599 // Multiply the lower half and higher half of vector iteratively. 
1600 // vtmp1 = vsrc[8:15] 1601 ins(vtmp1, D, vsrc, 0, 1); 1602 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1603 mulv(vtmp1, T8B, vtmp1, vsrc); 1604 // vtmp2 = vtmp1[4:7] 1605 ins(vtmp2, S, vtmp1, 0, 1); 1606 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1607 mulv(vtmp1, T8B, vtmp2, vtmp1); 1608 } else { 1609 ins(vtmp1, S, vsrc, 0, 1); 1610 mulv(vtmp1, T8B, vtmp1, vsrc); 1611 } 1612 // vtmp2 = vtmp1[2:3] 1613 ins(vtmp2, H, vtmp1, 0, 1); 1614 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1615 mulv(vtmp2, T8B, vtmp2, vtmp1); 1616 // dst = vtmp2[0] * isrc * vtmp2[1] 1617 umov(rscratch1, vtmp2, B, 0); 1618 mulw(dst, rscratch1, isrc); 1619 sxtb(dst, dst); 1620 umov(rscratch1, vtmp2, B, 1); 1621 mulw(dst, rscratch1, dst); 1622 sxtb(dst, dst); 1623 break; 1624 case T_SHORT: 1625 if (isQ) { 1626 ins(vtmp2, D, vsrc, 0, 1); 1627 mulv(vtmp2, T4H, vtmp2, vsrc); 1628 ins(vtmp1, S, vtmp2, 0, 1); 1629 mulv(vtmp1, T4H, vtmp1, vtmp2); 1630 } else { 1631 ins(vtmp1, S, vsrc, 0, 1); 1632 mulv(vtmp1, T4H, vtmp1, vsrc); 1633 } 1634 umov(rscratch1, vtmp1, H, 0); 1635 mulw(dst, rscratch1, isrc); 1636 sxth(dst, dst); 1637 umov(rscratch1, vtmp1, H, 1); 1638 mulw(dst, rscratch1, dst); 1639 sxth(dst, dst); 1640 break; 1641 case T_INT: 1642 if (isQ) { 1643 ins(vtmp1, D, vsrc, 0, 1); 1644 mulv(vtmp1, T2S, vtmp1, vsrc); 1645 } else { 1646 vtmp1 = vsrc; 1647 } 1648 umov(rscratch1, vtmp1, S, 0); 1649 mul(dst, rscratch1, isrc); 1650 umov(rscratch1, vtmp1, S, 1); 1651 mul(dst, rscratch1, dst); 1652 break; 1653 case T_LONG: 1654 umov(rscratch1, vsrc, D, 0); 1655 mul(dst, isrc, rscratch1); 1656 umov(rscratch1, vsrc, D, 1); 1657 mul(dst, dst, rscratch1); 1658 break; 1659 default: 1660 assert(false, "unsupported"); 1661 ShouldNotReachHere(); 1662 } 1663 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1664 } 1665 1666 // Vector reduction multiply for floating-point type with ASIMD instructions. 
1667 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1668 FloatRegister fsrc, FloatRegister vsrc, 1669 unsigned vector_length_in_bytes, 1670 FloatRegister vtmp) { 1671 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1672 bool isQ = vector_length_in_bytes == 16; 1673 1674 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1675 switch(bt) { 1676 case T_FLOAT: 1677 fmuls(dst, fsrc, vsrc); 1678 ins(vtmp, S, vsrc, 0, 1); 1679 fmuls(dst, dst, vtmp); 1680 if (isQ) { 1681 ins(vtmp, S, vsrc, 0, 2); 1682 fmuls(dst, dst, vtmp); 1683 ins(vtmp, S, vsrc, 0, 3); 1684 fmuls(dst, dst, vtmp); 1685 } 1686 break; 1687 case T_DOUBLE: 1688 assert(isQ, "unsupported"); 1689 fmuld(dst, fsrc, vsrc); 1690 ins(vtmp, D, vsrc, 0, 1); 1691 fmuld(dst, dst, vtmp); 1692 break; 1693 default: 1694 assert(false, "unsupported"); 1695 ShouldNotReachHere(); 1696 } 1697 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1698 } 1699 1700 // Helper to select logical instruction 1701 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1702 Register Rn, Register Rm, 1703 enum shift_kind kind, unsigned shift) { 1704 switch(opc) { 1705 case Op_AndReductionV: 1706 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1707 break; 1708 case Op_OrReductionV: 1709 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1710 break; 1711 case Op_XorReductionV: 1712 is64 ? 
eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1713 break; 1714 default: 1715 assert(false, "unsupported"); 1716 ShouldNotReachHere(); 1717 } 1718 } 1719 1720 // Vector reduction logical operations And, Or, Xor 1721 // Clobbers: rscratch1 1722 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1723 Register isrc, FloatRegister vsrc, 1724 unsigned vector_length_in_bytes) { 1725 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1726 "unsupported"); 1727 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1728 assert_different_registers(dst, isrc); 1729 bool isQ = vector_length_in_bytes == 16; 1730 1731 BLOCK_COMMENT("neon_reduce_logical {"); 1732 umov(rscratch1, vsrc, isQ ? D : S, 0); 1733 umov(dst, vsrc, isQ ? D : S, 1); 1734 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 1735 switch(bt) { 1736 case T_BYTE: 1737 if (isQ) { 1738 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1739 } 1740 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1741 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 1742 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1743 sxtb(dst, dst); 1744 break; 1745 case T_SHORT: 1746 if (isQ) { 1747 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1748 } 1749 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1750 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1751 sxth(dst, dst); 1752 break; 1753 case T_INT: 1754 if (isQ) { 1755 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1756 } 1757 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1758 break; 1759 case T_LONG: 1760 assert(isQ, "unsupported"); 1761 neon_reduce_logical_helper(opc, /* is64 */ true, 
dst, isrc, dst); 1762 break; 1763 default: 1764 assert(false, "unsupported"); 1765 ShouldNotReachHere(); 1766 } 1767 BLOCK_COMMENT("} neon_reduce_logical"); 1768 } 1769 1770 // Vector reduction min/max for integral type with ASIMD instructions. 1771 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 1772 // Clobbers: rscratch1, rflags 1773 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 1774 Register isrc, FloatRegister vsrc, 1775 unsigned vector_length_in_bytes, 1776 FloatRegister vtmp) { 1777 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 1778 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1779 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 1780 assert_different_registers(dst, isrc); 1781 bool isQ = vector_length_in_bytes == 16; 1782 bool is_min = opc == Op_MinReductionV; 1783 1784 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 1785 if (bt == T_LONG) { 1786 assert(vtmp == fnoreg, "should be"); 1787 assert(isQ, "should be"); 1788 umov(rscratch1, vsrc, D, 0); 1789 cmp(isrc, rscratch1); 1790 csel(dst, isrc, rscratch1, is_min ? LT : GT); 1791 umov(rscratch1, vsrc, D, 1); 1792 cmp(dst, rscratch1); 1793 csel(dst, dst, rscratch1, is_min ? LT : GT); 1794 } else { 1795 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1796 if (size == T2S) { 1797 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 1798 } else { 1799 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 1800 } 1801 if (bt == T_INT) { 1802 umov(dst, vtmp, S, 0); 1803 } else { 1804 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 1805 } 1806 cmpw(dst, isrc); 1807 cselw(dst, dst, isrc, is_min ? LT : GT); 1808 } 1809 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 1810 } 1811 1812 // Vector reduction for integral type with SVE instruction. 
1813 // Supported operations are Add, And, Or, Xor, Max, Min. 1814 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 1815 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 1816 FloatRegister src2, PRegister pg, FloatRegister tmp) { 1817 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 1818 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1819 assert_different_registers(src1, dst); 1820 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 1821 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1822 switch (opc) { 1823 case Op_AddReductionVI: { 1824 sve_uaddv(tmp, size, pg, src2); 1825 if (bt == T_BYTE) { 1826 smov(dst, tmp, size, 0); 1827 addw(dst, src1, dst, ext::sxtb); 1828 } else if (bt == T_SHORT) { 1829 smov(dst, tmp, size, 0); 1830 addw(dst, src1, dst, ext::sxth); 1831 } else { 1832 umov(dst, tmp, size, 0); 1833 addw(dst, dst, src1); 1834 } 1835 break; 1836 } 1837 case Op_AddReductionVL: { 1838 sve_uaddv(tmp, size, pg, src2); 1839 umov(dst, tmp, size, 0); 1840 add(dst, dst, src1); 1841 break; 1842 } 1843 case Op_AndReductionV: { 1844 sve_andv(tmp, size, pg, src2); 1845 if (bt == T_INT || bt == T_LONG) { 1846 umov(dst, tmp, size, 0); 1847 } else { 1848 smov(dst, tmp, size, 0); 1849 } 1850 if (bt == T_LONG) { 1851 andr(dst, dst, src1); 1852 } else { 1853 andw(dst, dst, src1); 1854 } 1855 break; 1856 } 1857 case Op_OrReductionV: { 1858 sve_orv(tmp, size, pg, src2); 1859 if (bt == T_INT || bt == T_LONG) { 1860 umov(dst, tmp, size, 0); 1861 } else { 1862 smov(dst, tmp, size, 0); 1863 } 1864 if (bt == T_LONG) { 1865 orr(dst, dst, src1); 1866 } else { 1867 orrw(dst, dst, src1); 1868 } 1869 break; 1870 } 1871 case Op_XorReductionV: { 1872 sve_eorv(tmp, size, pg, src2); 1873 if (bt == T_INT || bt == T_LONG) { 1874 umov(dst, tmp, size, 0); 1875 } else { 1876 
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      // Reduce the vector to its signed maximum, then fold in the scalar
      // input with a compare-and-select.
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // Re-sign-extend sub-word logical-reduction results so that "dst" holds
  // a properly sign-extended int value.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  // Lane counts 1-8 map directly to the VL1-VL8 pattern encodings, and
  // 16/32/64/128/256 have dedicated pattern encodings as well.
  switch(lane_cnt) {
  case 1: /* VL1 */
  case 2: /* VL2 */
  case 3: /* VL3 */
  case 4: /* VL4 */
  case 5: /* VL5 */
  case 6: /* VL6 */
  case 7: /* VL7 */
  case 8: /* VL8 */
    sve_ptrue(dst, size, lane_cnt);
    return;
  case 16:
    sve_ptrue(dst, size, /* VL16 */ 0b01001);
    return;
  case 32:
    sve_ptrue(dst, size, /* VL32 */ 0b01010);
    return;
  case 64:
    sve_ptrue(dst, size, /* VL64 */ 0b01011);
    return;
  case 128:
    sve_ptrue(dst, size, /* VL128 */ 0b01100);
    return;
  case 256:
    sve_ptrue(dst, size, /* VL256 */ 0b01101);
    return;
  default:
    break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases: the while-less-than
    // loop predicate sets exactly the first lane_cnt lanes to true.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
//
// SVE "compact" supports only S and D element sizes, so each half of the
// SHORT vector is widened to INT, compacted separately, narrowed back, and
// finally merged. rscratch1 ends up holding the number of active elements
// in the low half (TRUE_CNT), which the merge below relies on.
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift(cross lane) compressed high with TRUE_CNT lanes,
  // TRUE_CNT is the number of active elements in the compressed low.
  // The shift is done with "tbl" using an index vector that starts at
  // -TRUE_CNT; negative (out-of-range) indices read as zero, so the lanes
  // shifted in from below are zero-filled.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high(after shifted) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Pack active BYTE elements of src, under the control of mask, into the
// lowest-numbered elements of dst, filling the rest with zero. Each half
// is widened to SHORT and compressed via sve_compress_short.
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src   = 88 77 66 55 44 33 22 11
  //                  mask  = 01 00 00 01 01 00 01 01
  // Expected result: dst   = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat to the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  // vtmp4 was clobbered as scratch by sve_compress_short above; reset it
  // to zero before the narrowing below.
  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:   dst   = 00 00 00 00 00 44 22 11
  // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift(cross lane) compressed high with TRUE_CNT lanes,
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high(after shifted) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

// Reverse the bits of each element of the vector. For multi-byte elements
// the bytes are reversed first, so that the subsequent rbit (which reverses
// the bits within each byte) produces a full per-element bit reversal.
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

// Reverse the bytes of each element of the vector.
void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ?
T16B : T8B;
  switch (bt) {
    case T_BYTE:
      // Byte reversal of single-byte elements is the identity; just copy
      // when dst and src differ.
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    // Elements within the low 128 bits are reachable with a NEON move;
    // umov for full 32/64-bit elements, smov to sign-extend byte/short.
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    // Otherwise shift the wanted element down to lane 0 first. "idx << size"
    // scales the index to a byte offset (assumes B/H/S/D encode log2 of the
    // element size).
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      // rscratch1 holds the bit pattern of 2^23 (float): the magnitude
      // above which a float is already integral.
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src +
// 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Build a lane mask from an unsigned compare of the bit patterns of -src
  // against the 2^23 / 2^52 pattern in rscratch1.
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // bif (bitwise insert if false): copy tmp1 bits into dst where tmp3 is 0.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // Bit pattern of 2^23 (float): the magnitude above which a float is
      // already integral.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // Select (into pgtmp) the lanes that need the floor(src + 0.5) correction,
  // by comparing the 2^23 / 2^52 pattern against the negated inputs.
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // Skip the fix-up entirely when no lane was selected.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  // Convert the rounded floating-point values to integers.
  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

// Vector version of java.lang.Math::signum.
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  // Bit-select: take the magnitude bits from "one" and the sign bit from
  // "src" (giving +-1.0); zero/NaN lanes (mask 0) fall through to src.
  bsl(dst, T == T2S ?
T8B : T16B, one, src); // Result in dst
}

// SVE vector version of java.lang.Math::signum.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

// Returns true while code is being emitted into a scratch buffer (used only
// to measure code size). In addition to the MacroAssembler check, this also
// consults C2's PhaseOutput state when a compile task is active.
bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}