1 /* 2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Member-function-pointer type used to select a one-character load
// (ldrb for Latin1 data, ldrh for UTF-16 data) at code-emission time.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Emit the inline fast path for monitor enter (C2 FastLock node).
// On exit, flags are set for the caller: EQ indicates success (lock was
// acquired inline), NE indicates failure (caller must branch to the slow
// path). tmpReg, tmp2Reg, tmp3Reg and rscratch1 are clobbered.
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must never be synchronized on; force the slow
    // path (NE) so the runtime can diagnose the attempt.
    load_klass(tmp, oop);
    ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
    tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor (the markWord carries monitor_value when the
  // object is inflated).
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If condition is true we are cont and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, no_count);
    b(count);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == NULL,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from NULL to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  if (LockingMode != LM_LIGHTWEIGHT) {
    // Store a non-null value into the box to avoid looking like a re-entrant
    // lock. The fast-path monitor unlock code checks for
    // markWord::monitor_value so use markWord::unused_mark which has the
    // relevant bit set, and also matches ObjectSynchronizer::enter.
    mov(tmp, (address)markWord::unused_mark().value());
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  }
  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case: the owner is already the current thread, so just
  // bump the recursion count. No CAS needed.
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  // Successful inline lock: maintain the per-thread held-monitor count.
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

// Emit the inline fast path for monitor exit (C2 FastUnlock node).
// On exit, flags are set for the caller: EQ indicates success (lock was
// released inline), NE indicates failure (caller must branch to the slow
// path). tmpReg, tmp2Reg and rscratch1 are clobbered.
void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a light weight lock, this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    lightweight_unlock(oop, tmp, box, disp_hdr, no_count);
    b(count);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  // Strip the tag bits so tmp points at the ObjectMonitor itself.
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is anonymous, we need to fix it -- in an outline stub.
    Register tmp2 = disp_hdr;
    ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
    // We cannot use tbnz here, the target might be too far away and cannot
    // be encoded.
    tst(tmp2, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER);
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
    Compile::current()->output()->add_stub(stub);
    br(Assembler::NE, stub->entry());
    bind(stub->continuation());
  }

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock: just decrement the recursion count and report success.
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result (always EQ -> success)
  b(cont);

  bind(notRecursive);
  // If either EntryList or cxq is non-empty there are waiters; take the
  // slow path so a successor can be woken.
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  // Successful inline unlock: maintain the per-thread held-monitor count.
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
// Emit code implementing String.indexOf: find the pattern (str1, cnt1) in
// the source (str2, cnt2) and set 'result' to the character index of the
// first occurrence, or -1. 'ae' selects the encoding combination (LL/UU/LU/UL);
// 'icnt1' is the pattern length when known at compile time, or -1.
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  // str1 (pattern) / str2 (source) encodings, derived from 'ae'.
  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  // Per-encoding single-character loads (byte vs halfword).
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    // Runtime pattern length: choose between linear scan, Boyer-Moore, and
    // the out-of-line linear stub based on pattern/source sizes.
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = y[i+j];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c< 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    // Bad-character table: allocated on the stack, one skip byte per
    // possible character value; v0 (dup'ed with cnt1 above) initializes
    // every entry to the pattern length in 32-byte stores.
    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    // Fill the bad-character table from the first cnt1-1 pattern characters.
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        // UTF-16 pattern: characters >= 256 have no table slot.
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    // Outer search loop: compare the last pattern character at the current
    // alignment; on a match, verify the rest right-to-left.
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    // Inner verification loop: compare pattern and source one character at
    // a time, walking backwards through the pattern.
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    // Mismatch: advance str2 by the bad-character skip distance.
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);      // pop the bad-character table before leaving
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // tmp5 still holds the original str2 base; the difference is the
      // byte offset of the match.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);      // pop the bad-character table
      b(DONE);

    // Medium-size case: delegate to the pre-generated linear-scan stubs.
    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
      // General linear scan: negative indices count up towards zero so the
      // loop terminations fall out of the flags from adds().
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      // Scan for the first pattern character.
      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      // First character matched; verify the remainder of the pattern.
      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    // Known 4-character pattern: compare a whole word/doubleword at a time.
    if (icnt1 == 4) {
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    // 2-character pattern (known, or runtime-short same-encoding case).
    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    // 3-character pattern: match the leading pair, then check character 3.
    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    // Single-character pattern: SWAR search 8 bytes at a time, with a
    // simple per-character loop for short sources.
    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        // Replicate the search character into every lane of ch1.
        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        // SWAR zero-byte test: after the eor, a zero lane marks a match.
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // Re-check the (possibly overlapping) final word.
        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Locate the matching lane: reverse + count-leading-zeros gives the
        // byte offset of the first zero lane.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the (negative) byte offset back into a character index.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

// NOTE: chr_insn is re-declared here, identical to the typedef near the top
// of this file.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of the UTF-16 char 'ch' in (str1, cnt1) and set
// 'result' to its index, or -1 if absent. Uses a SWAR halfword search for
// cnt1 >= 4 and a simple per-character loop otherwise.
// Clobbers: rscratch1, rscratch2, tmp1-tmp3, cnt1, str1, ch, rflags.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string: no match possible.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate 'ch' into all four halfword lanes.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // SWAR zero-halfword test: a zero lane after the eor marks a match.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Re-check the (possibly overlapping) final doubleword.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
    // rev + clz locates the first matching lane as a byte offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the negative byte offset into a character index.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// SVE variant of string_indexof_char: predicated vector loads + compare per
// iteration; 'isL' selects Latin1 (byte) vs UTF-16 (halfword) elements.
// Sets 'result' to the index of the first match, or -1.
// Clobbers: rscratch1, rscratch2, ztmp1, ztmp2, tmp_pg, tmp_pdn, rflags.
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  // Empty string: no match possible.
  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

// Latin1 (byte) variant of string_indexof_char: find the first occurrence of
// the byte 'ch' in (str1, cnt1) and set 'result' to its index, or -1. Uses a
// SWAR byte search for cnt1 >= 8 and a per-byte loop otherwise.
// Clobbers: rscratch1, rscratch2, tmp1-tmp3, cnt1, str1, ch, rflags.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string: no match possible.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate 'ch' into all eight byte lanes.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // SWAR zero-byte test: a zero lane after the eor marks a match.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Re-check the (possibly overlapping) final doubleword.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
    // rev + clz locates the first matching lane as a byte offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
// Compare the leading characters of str1 (cnt1 chars) and str2 (cnt2 chars)
// and set 'result' to a negative, zero or positive value: the difference of
// the first pair of different characters, or, if one sequence is a prefix of
// the other, the difference of the lengths.
// 'ae' (StrIntrinsicNode::LL/LU/UL/UU) encodes which operands hold Latin1
// (one byte per char) and which hold UTF-16 (two bytes per char) data.
// NOTE(review): vtmp3, pgtmp1 and pgtmp2 are not referenced by this
// implementation — presumably kept for signature compatibility; confirm.
// Clobbers: rscratch1, rscratch2, rflags; str1, str2, cnt1, cnt2, tmp1 and
// tmp2 are destroyed.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // Shift/size used to scale a character index into a byte offset for each
  // string (Latin1: 1 byte/char, UTF-16: 2 bytes/char).
  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Number of characters that fit into one 64-bit word.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Per-encoding character loaders (byte vs halfword) and the matching
  // zero-extension used when isolating a differing character.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  // 'result' already holds the final answer for the case where all
  // min(cnt1, cnt2) leading characters compare equal.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Same backing array: contents are trivially equal, result already set.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point str1/str2 past the data and walk them with a negative byte
      // offset (cnt2) that counts up towards zero in the main loop.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin1 str1 is widened to UTF-16 in vtmp by interleaving with a zero
      // vector (zip1), so both sides compare as UTF-16 longwords.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      // cnt1/cnt2 become the (negative) byte offsets for str1/str2; they
      // advance at different rates because the encodings differ.
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror image of the LU case: str2 is the Latin1 operand.
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference. rev+clz yields the bit index of the first
    // (little-endian lowest-addressed) differing bit; the andr rounds it
    // down to a character boundary (8-bit for Latin1, 16-bit otherwise).
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  // Long strings: hand off to the out-of-line compare stub matching this
  // argument encoding.
  RuntimeAddress stub = nullptr;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // Software-pipelined: tmp1/cnt1 hold the previous character pair while
  // tmp2/rscratch1 are being loaded, and vice versa.
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

// Lane-wise NEON compare of src1 against src2, writing an all-ones /
// all-zeros mask per lane into dst. Conditions without a direct encoding
// (LT/LE/LO/LS) are obtained by swapping the operands of the inverse
// condition; NE is computed as NOT(EQ).
void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  // Floating-point lanes use FCM*, integral lanes use CM*.
  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

// Lane-wise NEON compare of src against zero, writing an all-ones /
// all-zeros mask per lane into dst. NE is computed as NOT(EQ); other
// conditions map directly to the compare-against-zero instruction forms.
void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected: dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // Each active lane becomes 0x01, each inactive lane 0x00.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    // Wider lanes are narrowed so each lane contributes exactly one byte.
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      // Move byte 8 (low byte of the high doubleword) next to byte 0.
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // No FEAT_BITPERM: fall back to scalar bit-compression of each 8-byte
    // group (see bytemask_compress).
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01100101 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates mask value with the minimum unit in byte, we should
  // transform the value in the first lane which is mask in bit now to the
  // mask in byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing. As only one byte exists.
  } else if (lane_cnt <= 16) {
    // Spread the two mask bytes one per doubleword, clearing the copy that
    // remains in the low doubleword.
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate mask according to the given vector, in which the elements have been
  // extended to expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Lane-wise SVE compare of zn against zm under governing predicate pg,
// setting the per-lane result in predicate pd. Conditions without a direct
// encoding (LE/LT/LO/LS) are obtained by swapping operands of the inverse.
// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the lanes so the last set lane becomes the first, then BRKB
  // activates the lanes before that first set lane; counting them gives the
  // distance of the last set lane from the end of the vector.
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  // dst = (max lane index) - distance-from-end = index of the last set lane.
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
4B -> 4I 1342 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1343 FloatRegister src, BasicType src_bt, bool is_unsigned) { 1344 if (src_bt == T_BYTE) { 1345 if (dst_bt == T_SHORT) { 1346 // 4B/8B to 4S/8S 1347 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1348 } else { 1349 // 4B to 4I 1350 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1351 _xshll(is_unsigned, dst, T8H, src, T8B, 0); 1352 _xshll(is_unsigned, dst, T4S, dst, T4H, 0); 1353 } 1354 } else if (src_bt == T_SHORT) { 1355 // 4S to 4I 1356 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1357 _xshll(is_unsigned, dst, T4S, src, T4H, 0); 1358 } else if (src_bt == T_INT) { 1359 // 2I to 2L 1360 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1361 _xshll(is_unsigned, dst, T2D, src, T2S, 0); 1362 } else { 1363 ShouldNotReachHere(); 1364 } 1365 } 1366 1367 // Narrow integer vector src down to dst with the same lane count 1368 // but smaller element size, e.g. 
// 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S: narrow to halfwords first, then to bytes if requested.
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

// Extend SVE vector src to dst, keeping the lane count but enlarging each
// element from src_size to dst_size. 'is_unsigned' selects zero- vs
// sign-extension; multi-step widenings chain the low-half unpacks via dst.
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          bool is_unsigned) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

  if (src_size == B) {
    switch (dst_size) {
    case H:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      break;
    case S:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      break;
    case D:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
    } else { // D
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
    }
  } else if (src_size == S) {
    _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
// Implemented with UZP1 against an all-zero tmp vector: the even (low)
// halves of src are concatenated with zeros at each narrowing step.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g.
// 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  // Each PUNPKLO doubles the element size of the low-half predicate bits,
  // so one/two/three applications cover 2x/4x/8x widening.
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  sve_pfalse(ptmp);
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Vector reduction add for integral type with ASIMD instructions.
// dst = isrc + sum(vsrc lanes), with the sub-int results sign-extended to
// int width before the scalar add.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      // ADDV has no 2S form; use a pairwise add for the 64-bit vector case.
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}

// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1], re-sign-extended to byte range
      // after each scalar multiply.
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      // No 64-bit lane multiply in NEON; do both lanes in scalar code.
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}

// Vector reduction multiply for floating-point type with ASIMD instructions.
// dst = fsrc * product(vsrc lanes); lanes are multiplied in ascending
// index order so the rounding behavior is deterministic.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
  switch(bt) {
    case T_FLOAT:
      fmuls(dst, fsrc, vsrc);
      ins(vtmp, S, vsrc, 0, 1);
      fmuls(dst, dst, vtmp);
      if (isQ) {
        ins(vtmp, S, vsrc, 0, 2);
        fmuls(dst, dst, vtmp);
        ins(vtmp, S, vsrc, 0, 3);
        fmuls(dst, dst, vtmp);
      }
      break;
    case T_DOUBLE:
      assert(isQ, "unsupported");
      fmuld(dst, fsrc, vsrc);
      ins(vtmp, D, vsrc, 0, 1);
      fmuld(dst, dst, vtmp);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select logical instruction
// Emits the 32- or 64-bit AND/ORR/EOR corresponding to the reduction opcode.
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  // Fold the two vector halves into a single 64-bit (or 32-bit) scalar,
  // then keep halving with shifted operands until one element remains.
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    // No 64-bit lane SMINV/SMAXV; compare both lanes in scalar code.
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, is_min ? LT : GT);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, is_min ? LT : GT);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      // SMINV/SMAXV have no 2S form; a pairwise op reduces the two lanes.
      is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
    } else {
      is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
    }
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    // Fold in the scalar input.
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, is_min ? LT : GT);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      // Sub-int lanes are sign-extended into the add; int lanes need no
      // extension.
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // Re-normalize the sub-int logical results to the element's value range.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      // The VL1..VL8 patterns encode the lane count directly.
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
1972 // Clobbers: rscratch1 1973 // Preserves: src, mask 1974 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask, 1975 FloatRegister vtmp1, FloatRegister vtmp2, 1976 PRegister pgtmp) { 1977 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 1978 assert_different_registers(dst, src, vtmp1, vtmp2); 1979 assert_different_registers(mask, pgtmp); 1980 1981 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111 1982 // mask = 0001 0000 0000 0001 0001 0000 0001 0001 1983 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111 1984 sve_dup(vtmp2, H, 0); 1985 1986 // Extend lowest half to type INT. 1987 // dst = 00004444 00003333 00002222 00001111 1988 sve_uunpklo(dst, S, src); 1989 // pgtmp = 00000001 00000000 00000001 00000001 1990 sve_punpklo(pgtmp, mask); 1991 // Pack the active elements in size of type INT to the right, 1992 // and fill the remainings with zero. 1993 // dst = 00000000 00004444 00002222 00001111 1994 sve_compact(dst, S, dst, pgtmp); 1995 // Narrow the result back to type SHORT. 1996 // dst = 0000 0000 0000 0000 0000 4444 2222 1111 1997 sve_uzp1(dst, H, dst, vtmp2); 1998 // Count the active elements of lowest half. 1999 // rscratch1 = 3 2000 sve_cntp(rscratch1, S, ptrue, pgtmp); 2001 2002 // Repeat to the highest half. 2003 // pgtmp = 00000001 00000000 00000000 00000001 2004 sve_punpkhi(pgtmp, mask); 2005 // vtmp1 = 00008888 00007777 00006666 00005555 2006 sve_uunpkhi(vtmp1, S, src); 2007 // vtmp1 = 00000000 00000000 00008888 00005555 2008 sve_compact(vtmp1, S, vtmp1, pgtmp); 2009 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2010 sve_uzp1(vtmp1, H, vtmp1, vtmp2); 2011 2012 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111 2013 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555 2014 // Left shift(cross lane) compressed high with TRUE_CNT lanes, 2015 // TRUE_CNT is the number of active elements in the compressed low. 
2016 neg(rscratch1, rscratch1); 2017 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2018 sve_index(vtmp2, H, rscratch1, 1); 2019 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000 2020 sve_tbl(vtmp1, H, vtmp1, vtmp2); 2021 2022 // Combine the compressed high(after shifted) with the compressed low. 2023 // dst = 0000 0000 0000 8888 5555 4444 2222 1111 2024 sve_orr(dst, dst, vtmp1); 2025 } 2026 2027 // Clobbers: rscratch1, rscratch2 2028 // Preserves: src, mask 2029 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask, 2030 FloatRegister vtmp1, FloatRegister vtmp2, 2031 FloatRegister vtmp3, FloatRegister vtmp4, 2032 PRegister ptmp, PRegister pgtmp) { 2033 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2034 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4); 2035 assert_different_registers(mask, ptmp, pgtmp); 2036 // Example input: src = 88 77 66 55 44 33 22 11 2037 // mask = 01 00 00 01 01 00 01 01 2038 // Expected result: dst = 00 00 00 88 55 44 22 11 2039 2040 sve_dup(vtmp4, B, 0); 2041 // Extend lowest half to type SHORT. 2042 // vtmp1 = 0044 0033 0022 0011 2043 sve_uunpklo(vtmp1, H, src); 2044 // ptmp = 0001 0000 0001 0001 2045 sve_punpklo(ptmp, mask); 2046 // Count the active elements of lowest half. 2047 // rscratch2 = 3 2048 sve_cntp(rscratch2, H, ptrue, ptmp); 2049 // Pack the active elements in size of type SHORT to the right, 2050 // and fill the remainings with zero. 2051 // dst = 0000 0044 0022 0011 2052 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp); 2053 // Narrow the result back to type BYTE. 2054 // dst = 00 00 00 00 00 44 22 11 2055 sve_uzp1(dst, B, dst, vtmp4); 2056 2057 // Repeat to the highest half. 
2058 // ptmp = 0001 0000 0000 0001 2059 sve_punpkhi(ptmp, mask); 2060 // vtmp1 = 0088 0077 0066 0055 2061 sve_uunpkhi(vtmp2, H, src); 2062 // vtmp1 = 0000 0000 0088 0055 2063 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp); 2064 2065 sve_dup(vtmp4, B, 0); 2066 // vtmp1 = 00 00 00 00 00 00 88 55 2067 sve_uzp1(vtmp1, B, vtmp1, vtmp4); 2068 2069 // Compressed low: dst = 00 00 00 00 00 44 22 11 2070 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55 2071 // Left shift(cross lane) compressed high with TRUE_CNT lanes, 2072 // TRUE_CNT is the number of active elements in the compressed low. 2073 neg(rscratch2, rscratch2); 2074 // vtmp2 = {4 3 2 1 0 -1 -2 -3} 2075 sve_index(vtmp2, B, rscratch2, 1); 2076 // vtmp1 = 00 00 00 88 55 00 00 00 2077 sve_tbl(vtmp1, B, vtmp1, vtmp2); 2078 // Combine the compressed high(after shifted) with the compressed low. 2079 // dst = 00 00 00 88 55 44 22 11 2080 sve_orr(dst, dst, vtmp1); 2081 } 2082 2083 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2084 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2085 SIMD_Arrangement size = isQ ? T16B : T8B; 2086 if (bt == T_BYTE) { 2087 rbit(dst, size, src); 2088 } else { 2089 neon_reverse_bytes(dst, src, bt, isQ); 2090 rbit(dst, size, dst); 2091 } 2092 } 2093 2094 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2095 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2096 SIMD_Arrangement size = isQ ? 
T16B : T8B; 2097 switch (bt) { 2098 case T_BYTE: 2099 if (dst != src) { 2100 orr(dst, size, src, src); 2101 } 2102 break; 2103 case T_SHORT: 2104 rev16(dst, size, src); 2105 break; 2106 case T_INT: 2107 rev32(dst, size, src); 2108 break; 2109 case T_LONG: 2110 rev64(dst, size, src); 2111 break; 2112 default: 2113 assert(false, "unsupported"); 2114 ShouldNotReachHere(); 2115 } 2116 } 2117 2118 // Extract a scalar element from an sve vector at position 'idx'. 2119 // The input elements in src are expected to be of integral type. 2120 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2121 int idx, FloatRegister vtmp) { 2122 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2123 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2124 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2125 if (bt == T_INT || bt == T_LONG) { 2126 umov(dst, src, size, idx); 2127 } else { 2128 smov(dst, src, size, idx); 2129 } 2130 } else { 2131 sve_orr(vtmp, src, src); 2132 sve_ext(vtmp, vtmp, idx << size); 2133 if (bt == T_INT || bt == T_LONG) { 2134 umov(dst, vtmp, size, 0); 2135 } else { 2136 smov(dst, vtmp, size, 0); 2137 } 2138 } 2139 } 2140 2141 // java.lang.Math::round intrinsics 2142 2143 // Clobbers: rscratch1, rflags 2144 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2145 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2146 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2147 switch (T) { 2148 case T2S: 2149 case T4S: 2150 fmovs(tmp1, T, 0.5f); 2151 mov(rscratch1, jint_cast(0x1.0p23f)); 2152 break; 2153 case T2D: 2154 fmovd(tmp1, T, 0.5); 2155 mov(rscratch1, julong_cast(0x1.0p52)); 2156 break; 2157 default: 2158 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2159 } 2160 fadd(tmp1, T, tmp1, src); 2161 fcvtms(tmp1, T, tmp1); 2162 // tmp1 = floor(src + 
0.5, ties to even) 2163 2164 fcvtas(dst, T, src); 2165 // dst = round(src), ties to away 2166 2167 fneg(tmp3, T, src); 2168 dup(tmp2, T, rscratch1); 2169 cm(HS, tmp3, T, tmp3, tmp2); 2170 // tmp3 is now a set of flags 2171 2172 bif(dst, T16B, tmp1, tmp3); 2173 // result in dst 2174 } 2175 2176 // Clobbers: rscratch1, rflags 2177 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2178 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2179 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2180 assert_different_registers(tmp1, tmp2, src, dst); 2181 2182 switch (T) { 2183 case S: 2184 mov(rscratch1, jint_cast(0x1.0p23f)); 2185 break; 2186 case D: 2187 mov(rscratch1, julong_cast(0x1.0p52)); 2188 break; 2189 default: 2190 assert(T == S || T == D, "invalid register variant"); 2191 } 2192 2193 sve_frinta(dst, T, ptrue, src); 2194 // dst = round(src), ties to away 2195 2196 Label none; 2197 2198 sve_fneg(tmp1, T, ptrue, src); 2199 sve_dup(tmp2, T, rscratch1); 2200 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1); 2201 br(EQ, none); 2202 { 2203 sve_cpy(tmp1, T, pgtmp, 0.5); 2204 sve_fadd(tmp1, T, pgtmp, src); 2205 sve_frintm(dst, T, pgtmp, tmp1); 2206 // dst = floor(src + 0.5, ties to even) 2207 } 2208 bind(none); 2209 2210 sve_fcvtzs(dst, T, ptrue, dst, T); 2211 // result in dst 2212 } 2213 2214 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, 2215 FloatRegister one, SIMD_Arrangement T) { 2216 assert_different_registers(dst, src, zero, one); 2217 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2218 2219 facgt(dst, T, src, zero); 2220 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise 2221 bsl(dst, T == T2S ? 
T8B : T16B, one, src); // Result in dst 2222 } 2223 2224 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero, 2225 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) { 2226 assert_different_registers(dst, src, zero, one, vtmp); 2227 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2228 2229 sve_orr(vtmp, src, src); 2230 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise 2231 switch (T) { 2232 case S: 2233 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src 2234 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending 2235 // on the sign of the float value 2236 break; 2237 case D: 2238 sve_and(vtmp, T, min_jlong); 2239 sve_orr(vtmp, T, jlong_cast(1.0)); 2240 break; 2241 default: 2242 assert(false, "unsupported"); 2243 ShouldNotReachHere(); 2244 } 2245 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp 2246 // Result in dst 2247 } 2248 2249 bool C2_MacroAssembler::in_scratch_emit_size() { 2250 if (ciEnv::current()->task() != nullptr) { 2251 PhaseOutput* phase_output = Compile::current()->output(); 2252 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) { 2253 return true; 2254 } 2255 } 2256 return MacroAssembler::in_scratch_emit_size(); 2257 }