1 /* 2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 36 #ifdef PRODUCT 37 #define BLOCK_COMMENT(str) /* nothing */ 38 #define STOP(error) stop(error) 39 #else 40 #define BLOCK_COMMENT(str) block_comment(str) 41 #define STOP(error) block_comment(error); stop(error) 42 #endif 43 44 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 45 46 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 47 48 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 49 Register tmp2Reg, Register tmp3Reg) { 50 Register oop = objectReg; 51 Register box = boxReg; 52 Register disp_hdr = tmpReg; 53 Register tmp = tmp2Reg; 54 Label cont; 55 Label object_has_monitor; 56 Label count, no_count; 57 58 assert_different_registers(oop, box, tmp, disp_hdr); 59 60 // Load markWord from object into displaced_header. 61 ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes())); 62 63 if (DiagnoseSyncOnValueBasedClasses != 0) { 64 load_klass(tmp, oop); 65 ldrw(tmp, Address(tmp, Klass::access_flags_offset())); 66 tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS); 67 br(Assembler::NE, cont); 68 } 69 70 // Check for existing monitor 71 tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor); 72 73 if (LockingMode == LM_MONITOR) { 74 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0. 75 b(cont); 76 } else if (LockingMode == LM_LEGACY) { 77 // Set tmp to be (markWord of object | UNLOCK_VALUE). 78 orr(tmp, disp_hdr, markWord::unlocked_value); 79 80 // Initialize the box. (Must happen before we update the object mark!) 81 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 82 83 // Compare object markWord with an unlocked value (tmp) and if 84 // equal exchange the stack address of our box with object markWord. 85 // On failure disp_hdr contains the possibly locked markWord. 
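  // A rough C-style sketch of this stack-lock fast path (illustrative only,
  // not the generated code; atomic_cas() and current_sp() are hypothetical
  // helpers):
  //
  //   bool legacy_fast_lock(uintptr_t* mark_addr, uintptr_t* box) {
  //     uintptr_t unlocked = *mark_addr | markWord::unlocked_value;
  //     box[0] = unlocked;                              // displaced header
  //     if (atomic_cas(mark_addr, unlocked, (uintptr_t)box)) {
  //       return true;                                  // we now own the stack lock
  //     }
  //     uintptr_t observed = *mark_addr;                // possibly locked markWord
  //     // Recursive case: the mark is a stack address within our own page
  //     // and the lock bits are clear.
  //     uintptr_t rec = (observed - current_sp()) &
  //                     (~(os::vm_page_size() - 1) | markWord::lock_mask_in_place);
  //     box[0] = rec;                                   // 0 means recursive lock
  //     return rec == 0;                                // otherwise take the slow path
  //   }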
86 cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true, 87 /*release*/ true, /*weak*/ false, disp_hdr); 88 br(Assembler::EQ, cont); 89 90 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 91 92 // If the compare-and-exchange succeeded, then we found an unlocked 93 // object, will have now locked it will continue at label cont 94 95 // Check if the owner is self by comparing the value in the 96 // markWord of object (disp_hdr) with the stack pointer. 97 mov(rscratch1, sp); 98 sub(disp_hdr, disp_hdr, rscratch1); 99 mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place)); 100 // If condition is true we are cont and hence we can store 0 as the 101 // displaced header in the box, which indicates that it is a recursive lock. 102 ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result 103 str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes())); 104 b(cont); 105 } else { 106 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 107 lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, no_count); 108 b(count); 109 } 110 111 // Handle existing monitor. 112 bind(object_has_monitor); 113 114 // The object's monitor m is unlocked iff m->owner == NULL, 115 // otherwise m->owner may contain a thread or a stack address. 116 // 117 // Try to CAS m->owner from NULL to current thread. 118 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value)); 119 cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true, 120 /*release*/ true, /*weak*/ false, rscratch1); // Sets flags for result 121 122 if (LockingMode != LM_LIGHTWEIGHT) { 123 // Store a non-null value into the box to avoid looking like a re-entrant 124 // lock. The fast-path monitor unlock code checks for 125 // markWord::monitor_value so use markWord::unused_mark which has the 126 // relevant bit set, and also matches ObjectSynchronizer::enter. 127 mov(tmp, (address)markWord::unused_mark().value()); 128 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes())); 129 } 130 br(Assembler::EQ, cont); // CAS success means locking succeeded 131 132 cmp(rscratch1, rthread); 133 br(Assembler::NE, cont); // Check for recursive locking 134 135 // Recursive lock case 136 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1); 137 // flag == EQ still from the cmp above, checking if this is a reentrant lock 138 139 bind(cont); 140 // flag == EQ indicates success 141 // flag == NE indicates failure 142 br(Assembler::NE, no_count); 143 144 bind(count); 145 increment(Address(rthread, JavaThread::held_monitor_count_offset())); 146 147 bind(no_count); 148 } 149 150 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg, 151 Register tmp2Reg) { 152 Register oop = objectReg; 153 Register box = boxReg; 154 Register disp_hdr = tmpReg; 155 Register tmp = tmp2Reg; 156 Label cont; 157 Label object_has_monitor; 158 Label count, no_count; 159 160 assert_different_registers(oop, box, tmp, disp_hdr); 161 162 if (LockingMode == LM_LEGACY) { 163 // Find the lock address and load the displaced header from the stack. 164 ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes())); 165 166 // If the displaced header is 0, we have a recursive unlock. 167 cmp(disp_hdr, zr); 168 br(Assembler::EQ, cont); 169 } 170 171 // Handle existing monitor. 
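  // In C-like terms (an illustrative sketch only), the check below is:
  //
  //   uintptr_t mark = *mark_addr;
  //   if (mark & markWord::monitor_value) {   // lowest bits 0b10: inflated
  //     ObjectMonitor* m = (ObjectMonitor*)(mark - markWord::monitor_value);
  //     ...                                   // unlock via the ObjectMonitor
  //   }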
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    lightweight_unlock(oop, tmp, box, disp_hdr, no_count);
    b(count);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is anonymous, we need to fix it -- in an outline stub.
    Register tmp2 = disp_hdr;
    ldr(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
    // We cannot use tbnz here, the target might be too far away and cannot
    // be encoded.
    tst(tmp2, (uint64_t)ObjectMonitor::ANONYMOUS_OWNER);
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
    Compile::current()->output()->add_stub(stub);
    br(Assembler::NE, stub->entry());
    bind(stub->continuation());
  }

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // Need a release store here.
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
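// Reference semantics (an illustrative sketch only, not the generated code):
// the intrinsic returns the same index as a naive scan over the decoded
// characters, e.g. for the UU case:
//
//   int index_of(const jchar* src, int n, const jchar* pat, int m) {
//     for (int i = 0; i + m <= n; i++) {
//       int j = 0;
//       while (j < m && src[i + j] == pat[j]) j++;
//       if (j == m) return i;
//     }
//     return -1;
//   }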
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with two shift rules: the 'Bad Character'
  // rule and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few Java-specific optimizations.
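  // Worked example of the bad-character table (illustrative): for the Latin1
  // pattern "abcab" (m = 5), every bc[] entry starts out as 5 and the
  // preprocessing loop below leaves bc['a'] = 1, bc['b'] = 3, bc['c'] = 2,
  // i.e. the distance from each character's last occurrence (excluding the
  // final pattern position) to the end of the pattern. On a mismatch the
  // window is advanced by bc[c], where c is the source character aligned
  // with the last pattern position.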
317 // 318 // #define ASIZE 256 319 // 320 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 321 // int i, j; 322 // unsigned c; 323 // unsigned char bc[ASIZE]; 324 // 325 // /* Preprocessing */ 326 // for (i = 0; i < ASIZE; ++i) 327 // bc[i] = m; 328 // for (i = 0; i < m - 1; ) { 329 // c = x[i]; 330 // ++i; 331 // // c < 256 for Latin1 string, so, no need for branch 332 // #ifdef PATTERN_STRING_IS_LATIN1 333 // bc[c] = m - i; 334 // #else 335 // if (c < ASIZE) bc[c] = m - i; 336 // #endif 337 // } 338 // 339 // /* Searching */ 340 // j = 0; 341 // while (j <= n - m) { 342 // c = y[i+j]; 343 // if (x[m-1] == c) 344 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 345 // if (i < 0) return j; 346 // // c < 256 for Latin1 string, so, no need for branch 347 // #ifdef SOURCE_STRING_IS_LATIN1 348 // // LL case: (c< 256) always true. Remove branch 349 // j += bc[y[j+m-1]]; 350 // #endif 351 // #ifndef PATTERN_STRING_IS_UTF 352 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 353 // if (c < ASIZE) 354 // j += bc[y[j+m-1]]; 355 // else 356 // j += 1 357 // #endif 358 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 359 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 360 // if (c < ASIZE) 361 // j += bc[y[j+m-1]]; 362 // else 363 // j += m 364 // #endif 365 // } 366 // } 367 368 if (icnt1 == -1) { 369 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 370 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 371 Register cnt1end = tmp2; 372 Register str2end = cnt2; 373 Register skipch = tmp2; 374 375 // str1 length is >=8, so, we can read at least 1 register for cases when 376 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 377 // UL case. We'll re-read last character in inner pre-loop code to have 378 // single outer pre-loop load 379 const int firstStep = isL ? 7 : 3; 380 381 const int ASIZE = 256; 382 const int STORED_BYTES = 32; // amount of bytes stored per instruction 383 sub(sp, sp, ASIZE); 384 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 385 mov(ch1, sp); 386 BIND(BM_INIT_LOOP); 387 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 388 subs(tmp5, tmp5, 1); 389 br(GT, BM_INIT_LOOP); 390 391 sub(cnt1tmp, cnt1, 1); 392 mov(tmp5, str2); 393 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 394 sub(ch2, cnt1, 1); 395 mov(tmp3, str1); 396 BIND(BCLOOP); 397 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 398 if (!str1_isL) { 399 subs(zr, ch1, ASIZE); 400 br(HS, BCSKIP); 401 } 402 strb(ch2, Address(sp, ch1)); 403 BIND(BCSKIP); 404 subs(ch2, ch2, 1); 405 br(GT, BCLOOP); 406 407 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 408 if (str1_isL == str2_isL) { 409 // load last 8 bytes (8LL/4UU symbols) 410 ldr(tmp6, Address(tmp6, -wordSize)); 411 } else { 412 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 413 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 414 // it's still faster than per-character loads+checks 415 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 416 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 417 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 418 andr(tmp6, tmp6, 0xFF); // str1[N-4] 419 orr(ch2, ch1, ch2, LSL, 16); 420 orr(tmp6, tmp6, tmp3, LSL, 48); 421 orr(tmp6, tmp6, ch2, LSL, 16); 422 } 423 BIND(BMLOOPSTR2); 424 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 425 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 426 if (str1_isL == str2_isL) { 427 // re-init tmp3. It's for free because it's executed in parallel with 428 // load above. Alternative is to initialize it before loop, but it'll 429 // affect performance on in-order systems with 2 or more ld/st pipelines 430 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 431 } 432 if (!isL) { // UU/UL case 433 lsl(ch2, cnt1tmp, 1); // offset in bytes 434 } 435 cmp(tmp3, skipch); 436 br(NE, BMSKIP); 437 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 438 mov(ch1, tmp6); 439 if (isL) { 440 b(BMLOOPSTR1_AFTER_LOAD); 441 } else { 442 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 443 b(BMLOOPSTR1_CMP); 444 } 445 BIND(BMLOOPSTR1); 446 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 447 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 448 BIND(BMLOOPSTR1_AFTER_LOAD); 449 subs(cnt1tmp, cnt1tmp, 1); 450 br(LT, BMLOOPSTR1_LASTCMP); 451 BIND(BMLOOPSTR1_CMP); 452 cmp(ch1, ch2); 453 br(EQ, BMLOOPSTR1); 454 BIND(BMSKIP); 455 if (!isL) { 456 // if we've met UTF symbol while searching Latin1 pattern, then we can 457 // skip cnt1 symbols 458 if (str1_isL != str2_isL) { 459 mov(result_tmp, cnt1); 460 } else { 461 mov(result_tmp, 1); 462 } 463 subs(zr, skipch, ASIZE); 464 br(HS, BMADV); 465 } 466 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 467 BIND(BMADV); 468 sub(cnt1tmp, cnt1, 1); 469 add(str2, str2, result_tmp, LSL, str2_chr_shift); 470 cmp(str2, str2end); 471 br(LE, BMLOOPSTR2); 472 add(sp, sp, ASIZE); 473 b(NOMATCH); 474 BIND(BMLOOPSTR1_LASTCMP); 475 cmp(ch1, ch2); 476 br(NE, BMSKIP); 477 BIND(BMMATCH); 478 sub(result, str2, tmp5); 479 if (!str2_isL) lsr(result, result, 1); 480 add(sp, sp, ASIZE); 481 b(DONE); 482 483 BIND(LINEARSTUB); 484 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 485 br(LT, LINEAR_MEDIUM); 486 mov(result, zr); 487 RuntimeAddress stub = nullptr; 488 if (isL) { 489 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 490 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 491 } else if (str1_isL) { 492 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 493 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 494 } else { 495 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 496 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 497 } 498 address call = trampoline_call(stub); 499 if (call == nullptr) { 500 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 501 ciEnv::current()->record_failure("CodeCache is full"); 502 return; 503 } 504 b(DONE); 505 } 506 507 BIND(LINEARSEARCH); 508 { 509 Label DO1, DO2, DO3; 510 511 Register str2tmp = tmp2; 512 Register first = tmp3; 513 514 if (icnt1 == 
-1) 515 { 516 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 517 518 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 519 br(LT, DOSHORT); 520 BIND(LINEAR_MEDIUM); 521 (this->*str1_load_1chr)(first, Address(str1)); 522 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 523 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 524 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 525 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 526 527 BIND(FIRST_LOOP); 528 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 529 cmp(first, ch2); 530 br(EQ, STR1_LOOP); 531 BIND(STR2_NEXT); 532 adds(cnt2_neg, cnt2_neg, str2_chr_size); 533 br(LE, FIRST_LOOP); 534 b(NOMATCH); 535 536 BIND(STR1_LOOP); 537 adds(cnt1tmp, cnt1_neg, str1_chr_size); 538 add(cnt2tmp, cnt2_neg, str2_chr_size); 539 br(GE, MATCH); 540 541 BIND(STR1_NEXT); 542 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 543 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 544 cmp(ch1, ch2); 545 br(NE, STR2_NEXT); 546 adds(cnt1tmp, cnt1tmp, str1_chr_size); 547 add(cnt2tmp, cnt2tmp, str2_chr_size); 548 br(LT, STR1_NEXT); 549 b(MATCH); 550 551 BIND(DOSHORT); 552 if (str1_isL == str2_isL) { 553 cmp(cnt1, (u1)2); 554 br(LT, DO1); 555 br(GT, DO3); 556 } 557 } 558 559 if (icnt1 == 4) { 560 Label CH1_LOOP; 561 562 (this->*load_4chr)(ch1, str1); 563 sub(result_tmp, cnt2, 4); 564 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 565 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 566 567 BIND(CH1_LOOP); 568 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 569 cmp(ch1, ch2); 570 br(EQ, MATCH); 571 adds(cnt2_neg, cnt2_neg, str2_chr_size); 572 br(LE, CH1_LOOP); 573 b(NOMATCH); 574 } 575 576 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 577 Label CH1_LOOP; 578 579 BIND(DO2); 580 (this->*load_2chr)(ch1, str1); 581 if (icnt1 == 2) { 582 sub(result_tmp, cnt2, 2); 583 } 584 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 585 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 586 BIND(CH1_LOOP); 587 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 588 cmp(ch1, ch2); 589 br(EQ, MATCH); 590 adds(cnt2_neg, cnt2_neg, str2_chr_size); 591 br(LE, CH1_LOOP); 592 b(NOMATCH); 593 } 594 595 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 596 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 597 598 BIND(DO3); 599 (this->*load_2chr)(first, str1); 600 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 601 if (icnt1 == 3) { 602 sub(result_tmp, cnt2, 3); 603 } 604 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 605 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 606 BIND(FIRST_LOOP); 607 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 608 cmpw(first, ch2); 609 br(EQ, STR1_LOOP); 610 BIND(STR2_NEXT); 611 adds(cnt2_neg, cnt2_neg, str2_chr_size); 612 br(LE, FIRST_LOOP); 613 b(NOMATCH); 614 615 BIND(STR1_LOOP); 616 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 617 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 618 cmp(ch1, ch2); 619 br(NE, STR2_NEXT); 620 b(MATCH); 621 } 622 623 if (icnt1 == -1 || icnt1 == 1) { 624 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 625 626 BIND(DO1); 627 (this->*str1_load_1chr)(ch1, str1); 628 cmp(cnt2, (u1)8); 629 br(LT, DO1_SHORT); 630 631 sub(result_tmp, cnt2, 8/str2_chr_size); 632 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 633 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 634 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 635 636 if (str2_isL) { 637 orr(ch1, ch1, ch1, LSL, 8); 638 } 639 orr(ch1, ch1, ch1, LSL, 16); 640 orr(ch1, ch1, ch1, LSL, 32); 641 BIND(CH1_LOOP); 642 ldr(ch2, Address(str2, cnt2_neg)); 643 eor(ch2, ch1, ch2); 644 sub(tmp1, ch2, tmp3); 645 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 646 bics(tmp1, tmp1, tmp2); 647 br(NE, HAS_ZERO); 648 adds(cnt2_neg, cnt2_neg, 8); 649 br(LT, CH1_LOOP); 650 651 cmp(cnt2_neg, (u1)8); 652 mov(cnt2_neg, 0); 653 br(LT, CH1_LOOP); 654 b(NOMATCH); 655 656 BIND(HAS_ZERO); 657 rev(tmp1, tmp1); 658 clz(tmp1, tmp1); 659 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 660 b(MATCH); 661 662 BIND(DO1_SHORT); 663 mov(result_tmp, cnt2); 664 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 665 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 666 BIND(DO1_LOOP); 667 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 668 cmpw(ch1, ch2); 669 br(EQ, MATCH); 670 adds(cnt2_neg, cnt2_neg, str2_chr_size); 671 br(LT, DO1_LOOP); 672 } 673 } 674 BIND(NOMATCH); 675 mov(result, -1); 676 b(DONE); 677 BIND(MATCH); 678 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 679 BIND(DONE); 680 } 681 682 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 683 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 684 685 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 686 Register ch, Register result, 687 Register tmp1, Register tmp2, Register tmp3) 688 { 689 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 690 Register cnt1_neg = cnt1; 691 Register ch1 = rscratch1; 692 Register result_tmp = rscratch2; 693 694 cbz(cnt1, NOMATCH); 695 696 cmp(cnt1, (u1)4); 697 br(LT, DO1_SHORT); 698 699 orr(ch, ch, ch, LSL, 16); 700 orr(ch, ch, ch, LSL, 32); 701 702 sub(cnt1, cnt1, 4); 703 mov(result_tmp, cnt1); 704 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 705 sub(cnt1_neg, zr, cnt1, LSL, 1); 706 707 mov(tmp3, 0x0001000100010001); 708 709 BIND(CH1_LOOP); 710 ldr(ch1, Address(str1, cnt1_neg)); 711 eor(ch1, ch, ch1); 712 sub(tmp1, ch1, tmp3); 713 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 714 bics(tmp1, tmp1, tmp2); 715 br(NE, HAS_ZERO); 716 adds(cnt1_neg, cnt1_neg, 8); 717 br(LT, CH1_LOOP); 718 719 cmp(cnt1_neg, (u1)8); 720 mov(cnt1_neg, 0); 721 br(LT, CH1_LOOP); 722 b(NOMATCH); 723 724 BIND(HAS_ZERO); 725 rev(tmp1, tmp1); 726 clz(tmp1, tmp1); 727 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 728 b(MATCH); 729 730 BIND(DO1_SHORT); 731 mov(result_tmp, cnt1); 732 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 733 sub(cnt1_neg, zr, cnt1, LSL, 1); 734 BIND(DO1_LOOP); 735 ldrh(ch1, Address(str1, cnt1_neg)); 736 cmpw(ch, ch1); 737 br(EQ, MATCH); 738 adds(cnt1_neg, cnt1_neg, 2); 739 br(LT, DO1_LOOP); 740 BIND(NOMATCH); 741 mov(result, -1); 742 b(DONE); 743 BIND(MATCH); 744 add(result, result_tmp, cnt1_neg, ASR, 1); 745 BIND(DONE); 746 } 747 748 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 749 Register ch, Register result, 750 FloatRegister ztmp1, 751 FloatRegister ztmp2, 752 PRegister tmp_pg, 753 PRegister tmp_pdn, bool isL) 754 { 755 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 756 assert(tmp_pg->is_governing(), 757 "this register has to be a governing predicate register"); 758 759 Label LOOP, MATCH, DONE, NOMATCH; 760 Register vec_len = rscratch1; 761 Register idx = rscratch2; 762 763 SIMD_RegVariant T = (isL == true) ? 
B : H; 764 765 cbz(cnt1, NOMATCH); 766 767 // Assign the particular char throughout the vector. 768 sve_dup(ztmp2, T, ch); 769 if (isL) { 770 sve_cntb(vec_len); 771 } else { 772 sve_cnth(vec_len); 773 } 774 mov(idx, 0); 775 776 // Generate a predicate to control the reading of input string. 777 sve_whilelt(tmp_pg, T, idx, cnt1); 778 779 BIND(LOOP); 780 // Read a vector of 8- or 16-bit data depending on the string type. Note 781 // that inactive elements indicated by the predicate register won't cause 782 // a data read from memory to the destination vector. 783 if (isL) { 784 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 785 } else { 786 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 787 } 788 add(idx, idx, vec_len); 789 790 // Perform the comparison. An element of the destination predicate is set 791 // to active if the particular char is matched. 792 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 793 794 // Branch if the particular char is found. 795 br(NE, MATCH); 796 797 sve_whilelt(tmp_pg, T, idx, cnt1); 798 799 // Loop back if the particular char not found. 800 br(MI, LOOP); 801 802 BIND(NOMATCH); 803 mov(result, -1); 804 b(DONE); 805 806 BIND(MATCH); 807 // Undo the index increment. 808 sub(idx, idx, vec_len); 809 810 // Crop the vector to find its location. 811 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 812 add(result, idx, -1); 813 sve_incp(result, T, tmp_pdn); 814 BIND(DONE); 815 } 816 817 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 818 Register ch, Register result, 819 Register tmp1, Register tmp2, Register tmp3) 820 { 821 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 822 Register cnt1_neg = cnt1; 823 Register ch1 = rscratch1; 824 Register result_tmp = rscratch2; 825 826 cbz(cnt1, NOMATCH); 827 828 cmp(cnt1, (u1)8); 829 br(LT, DO1_SHORT); 830 831 orr(ch, ch, ch, LSL, 8); 832 orr(ch, ch, ch, LSL, 16); 833 orr(ch, ch, ch, LSL, 32); 834 835 sub(cnt1, cnt1, 8); 836 mov(result_tmp, cnt1); 837 lea(str1, Address(str1, cnt1)); 838 sub(cnt1_neg, zr, cnt1); 839 840 mov(tmp3, 0x0101010101010101); 841 842 BIND(CH1_LOOP); 843 ldr(ch1, Address(str1, cnt1_neg)); 844 eor(ch1, ch, ch1); 845 sub(tmp1, ch1, tmp3); 846 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 847 bics(tmp1, tmp1, tmp2); 848 br(NE, HAS_ZERO); 849 adds(cnt1_neg, cnt1_neg, 8); 850 br(LT, CH1_LOOP); 851 852 cmp(cnt1_neg, (u1)8); 853 mov(cnt1_neg, 0); 854 br(LT, CH1_LOOP); 855 b(NOMATCH); 856 857 BIND(HAS_ZERO); 858 rev(tmp1, tmp1); 859 clz(tmp1, tmp1); 860 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 861 b(MATCH); 862 863 BIND(DO1_SHORT); 864 mov(result_tmp, cnt1); 865 lea(str1, Address(str1, cnt1)); 866 sub(cnt1_neg, zr, cnt1); 867 BIND(DO1_LOOP); 868 ldrb(ch1, Address(str1, cnt1_neg)); 869 cmp(ch, ch1); 870 br(EQ, MATCH); 871 adds(cnt1_neg, cnt1_neg, 1); 872 br(LT, DO1_LOOP); 873 BIND(NOMATCH); 874 mov(result, -1); 875 b(DONE); 876 BIND(MATCH); 877 add(result, result_tmp, cnt1_neg); 878 BIND(DONE); 879 } 880 881 // Compare strings. 
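// Reference semantics (an illustrative sketch only, not the generated code):
//
//   int compare(const jchar* a, int la, const jchar* b, int lb) {
//     int n = la < lb ? la : lb;
//     for (int i = 0; i < n; i++) {
//       if (a[i] != b[i]) return (int)a[i] - (int)b[i];
//     }
//     return la - lb;
//   }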
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
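  // In scalar terms (illustrative): result = cnt1 - cnt2 and cnt2 = min(cnt1, cnt2);
  // 'result' survives to the end only if the first min(cnt1, cnt2) characters
  // compare equal, matching the reference sketch above.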
924 subsw(result, cnt1, cnt2); 925 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 926 927 // A very short string 928 cmpw(cnt2, minCharsInWord); 929 br(Assembler::LE, SHORT_STRING); 930 931 // Compare longwords 932 // load first parts of strings and finish initialization while loading 933 { 934 if (str1_isL == str2_isL) { // LL or UU 935 ldr(tmp1, Address(str1)); 936 cmp(str1, str2); 937 br(Assembler::EQ, DONE); 938 ldr(tmp2, Address(str2)); 939 cmp(cnt2, stub_threshold); 940 br(GE, STUB); 941 subsw(cnt2, cnt2, minCharsInWord); 942 br(EQ, TAIL_CHECK); 943 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 944 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 945 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 946 } else if (isLU) { 947 ldrs(vtmp, Address(str1)); 948 ldr(tmp2, Address(str2)); 949 cmp(cnt2, stub_threshold); 950 br(GE, STUB); 951 subw(cnt2, cnt2, 4); 952 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 953 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 954 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 955 zip1(vtmp, T8B, vtmp, vtmpZ); 956 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 957 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 958 add(cnt1, cnt1, 4); 959 fmovd(tmp1, vtmp); 960 } else { // UL case 961 ldr(tmp1, Address(str1)); 962 ldrs(vtmp, Address(str2)); 963 cmp(cnt2, stub_threshold); 964 br(GE, STUB); 965 subw(cnt2, cnt2, 4); 966 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 967 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 968 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 969 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 970 zip1(vtmp, T8B, vtmp, vtmpZ); 971 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 972 add(cnt1, cnt1, 8); 973 fmovd(tmp2, vtmp); 974 } 975 adds(cnt2, cnt2, isUL ? 4 : 8); 976 br(GE, TAIL); 977 eor(rscratch2, tmp1, tmp2); 978 cbnz(rscratch2, DIFF); 979 // main loop 980 bind(NEXT_WORD); 981 if (str1_isL == str2_isL) { 982 ldr(tmp1, Address(str1, cnt2)); 983 ldr(tmp2, Address(str2, cnt2)); 984 adds(cnt2, cnt2, 8); 985 } else if (isLU) { 986 ldrs(vtmp, Address(str1, cnt1)); 987 ldr(tmp2, Address(str2, cnt2)); 988 add(cnt1, cnt1, 4); 989 zip1(vtmp, T8B, vtmp, vtmpZ); 990 fmovd(tmp1, vtmp); 991 adds(cnt2, cnt2, 8); 992 } else { // UL 993 ldrs(vtmp, Address(str2, cnt2)); 994 ldr(tmp1, Address(str1, cnt1)); 995 zip1(vtmp, T8B, vtmp, vtmpZ); 996 add(cnt1, cnt1, 8); 997 fmovd(tmp2, vtmp); 998 adds(cnt2, cnt2, 4); 999 } 1000 br(GE, TAIL); 1001 1002 eor(rscratch2, tmp1, tmp2); 1003 cbz(rscratch2, NEXT_WORD); 1004 b(DIFF); 1005 bind(TAIL); 1006 eor(rscratch2, tmp1, tmp2); 1007 cbnz(rscratch2, DIFF); 1008 // Last longword. In the case where length == 4 we compare the 1009 // same longword twice, but that's still faster than another 1010 // conditional branch. 1011 if (str1_isL == str2_isL) { 1012 ldr(tmp1, Address(str1)); 1013 ldr(tmp2, Address(str2)); 1014 } else if (isLU) { 1015 ldrs(vtmp, Address(str1)); 1016 ldr(tmp2, Address(str2)); 1017 zip1(vtmp, T8B, vtmp, vtmpZ); 1018 fmovd(tmp1, vtmp); 1019 } else { // UL 1020 ldrs(vtmp, Address(str2)); 1021 ldr(tmp1, Address(str1)); 1022 zip1(vtmp, T8B, vtmp, vtmpZ); 1023 fmovd(tmp2, vtmp); 1024 } 1025 bind(TAIL_CHECK); 1026 eor(rscratch2, tmp1, tmp2); 1027 cbz(rscratch2, DONE); 1028 1029 // Find the first different characters in the longwords and 1030 // compute their difference. 1031 bind(DIFF); 1032 rev(rscratch2, rscratch2); 1033 clz(rscratch2, rscratch2); 1034 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1035 lsrv(tmp1, tmp1, rscratch2); 1036 (this->*ext_chr)(tmp1, tmp1); 1037 lsrv(tmp2, tmp2, rscratch2); 1038 (this->*ext_chr)(tmp2, tmp2); 1039 subw(result, tmp1, tmp2); 1040 b(DONE); 1041 } 1042 1043 bind(STUB); 1044 RuntimeAddress stub = nullptr; 1045 switch(ae) { 1046 case StrIntrinsicNode::LL: 1047 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1048 break; 1049 case StrIntrinsicNode::UU: 1050 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1051 break; 1052 case StrIntrinsicNode::LU: 1053 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1054 break; 1055 case StrIntrinsicNode::UL: 1056 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1057 break; 1058 default: 1059 ShouldNotReachHere(); 1060 } 1061 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1062 address call = trampoline_call(stub); 1063 if (call == nullptr) { 1064 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1065 ciEnv::current()->record_failure("CodeCache is full"); 1066 return; 1067 } 1068 b(DONE); 1069 1070 bind(SHORT_STRING); 1071 // Is the minimum length zero? 1072 cbz(cnt2, DONE); 1073 // arrange code to do most branches while loading and loading next characters 1074 // while comparing previous 1075 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1076 subs(cnt2, cnt2, 1); 1077 br(EQ, SHORT_LAST_INIT); 1078 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1079 b(SHORT_LOOP_START); 1080 bind(SHORT_LOOP); 1081 subs(cnt2, cnt2, 1); 1082 br(EQ, SHORT_LAST); 1083 bind(SHORT_LOOP_START); 1084 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1085 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1086 cmp(tmp1, cnt1); 1087 br(NE, SHORT_LOOP_TAIL); 1088 subs(cnt2, cnt2, 1); 1089 br(EQ, SHORT_LAST2); 1090 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1091 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1092 cmp(tmp2, rscratch1); 1093 br(EQ, SHORT_LOOP); 1094 sub(result, tmp2, rscratch1); 1095 b(DONE); 1096 bind(SHORT_LOOP_TAIL); 1097 sub(result, tmp1, cnt1); 1098 b(DONE); 1099 bind(SHORT_LAST2); 1100 cmp(tmp2, rscratch1); 1101 br(EQ, DONE); 1102 sub(result, tmp2, rscratch1); 1103 1104 b(DONE); 1105 bind(SHORT_LAST_INIT); 1106 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1107 bind(SHORT_LAST); 1108 cmp(tmp1, cnt1); 1109 br(EQ, DONE); 1110 sub(result, tmp1, cnt1); 1111 1112 bind(DONE); 1113 1114 BLOCK_COMMENT("} string_compare"); 1115 } 1116 1117 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1118 FloatRegister src2, Condition cond, bool isQ) { 1119 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1120 FloatRegister zn = src1, zm = src2; 1121 bool needs_negation = false; 1122 switch (cond) { 1123 case LT: cond = GT; zn = src2; zm = src1; break; 1124 case LE: cond = GE; zn = src2; zm = src1; break; 1125 case LO: cond = HI; zn = src2; zm = src1; break; 1126 case LS: cond = HS; zn = src2; zm = src1; break; 1127 case NE: cond = EQ; needs_negation = true; break; 1128 default: 1129 break; 1130 } 1131 1132 if (is_floating_point_type(bt)) { 1133 fcm(cond, dst, size, zn, zm); 1134 } else { 1135 cm(cond, dst, size, zn, zm); 1136 } 1137 1138 if (needs_negation) { 1139 notr(dst, isQ ? 
T16B : T8B, dst); 1140 } 1141 } 1142 1143 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1144 Condition cond, bool isQ) { 1145 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1146 if (bt == T_FLOAT || bt == T_DOUBLE) { 1147 if (cond == Assembler::NE) { 1148 fcm(Assembler::EQ, dst, size, src); 1149 notr(dst, isQ ? T16B : T8B, dst); 1150 } else { 1151 fcm(cond, dst, size, src); 1152 } 1153 } else { 1154 if (cond == Assembler::NE) { 1155 cm(Assembler::EQ, dst, size, src); 1156 notr(dst, isQ ? T16B : T8B, dst); 1157 } else { 1158 cm(cond, dst, size, src); 1159 } 1160 } 1161 } 1162 1163 // Compress the least significant bit of each byte to the rightmost and clear 1164 // the higher garbage bits. 1165 void C2_MacroAssembler::bytemask_compress(Register dst) { 1166 // Example input, dst = 0x01 00 00 00 01 01 00 01 1167 // The "??" bytes are garbage. 1168 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1169 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1170 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1171 andr(dst, dst, 0xff); // dst = 0x8D 1172 } 1173 1174 // Pack the lowest-numbered bit of each mask element in src into a long value 1175 // in dst, at most the first 64 lane elements. 1176 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1177 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1178 FloatRegister vtmp1, FloatRegister vtmp2) { 1179 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1180 assert_different_registers(dst, rscratch1); 1181 assert_different_registers(vtmp1, vtmp2); 1182 1183 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1184 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1185 // Expected: dst = 0x658D 1186 1187 // Convert the mask into vector with sequential bytes. 1188 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1189 sve_cpy(vtmp1, size, src, 1, false); 1190 if (bt != T_BYTE) { 1191 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1192 } 1193 1194 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1195 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1196 // is to compress each significant bit of the byte in a cross-lane way. Due 1197 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1198 // (bit-compress in each lane) with the biggest lane size (T = D) then 1199 // concatenate the results. 1200 1201 // The second source input of BEXT, initialized with 0x01 in each byte. 1202 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1203 sve_dup(vtmp2, B, 1); 1204 1205 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1206 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1207 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1208 // --------------------------------------- 1209 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1210 sve_bext(vtmp1, D, vtmp1, vtmp2); 1211 1212 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1213 // result to dst. 1214 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1215 // dst = 0x658D 1216 if (lane_cnt <= 8) { 1217 // No need to concatenate. 
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register in different types,
// with a valid src(0x658D) on a 1024-bit vector size machine.
//   BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
//   SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
//   INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
//   LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask at byte granularity, we need to transform
  // the value in the first lane from a bit mask into a byte mask, which can
  // be done with SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1288 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1289 sve_dup(vtmp2, B, 1); 1290 1291 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1292 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1293 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1294 // --------------------------------------- 1295 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1296 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1297 1298 if (bt != T_BYTE) { 1299 sve_vector_extend(vtmp1, size, vtmp1, B); 1300 } 1301 // Generate mask according to the given vector, in which the elements have been 1302 // extended to expected type. 1303 // dst = 0b01101001 10001101 1304 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1305 } 1306 1307 // Clobbers: rflags 1308 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1309 FloatRegister zn, FloatRegister zm, Condition cond) { 1310 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1311 FloatRegister z1 = zn, z2 = zm; 1312 switch (cond) { 1313 case LE: z1 = zm; z2 = zn; cond = GE; break; 1314 case LT: z1 = zm; z2 = zn; cond = GT; break; 1315 case LO: z1 = zm; z2 = zn; cond = HI; break; 1316 case LS: z1 = zm; z2 = zn; cond = HS; break; 1317 default: 1318 break; 1319 } 1320 1321 SIMD_RegVariant size = elemType_to_regVariant(bt); 1322 if (is_floating_point_type(bt)) { 1323 sve_fcm(cond, pd, size, pg, z1, z2); 1324 } else { 1325 assert(is_integral_type(bt), "unsupported element type"); 1326 sve_cmp(cond, pd, size, pg, z1, z2); 1327 } 1328 } 1329 1330 // Get index of the last mask lane that is set 1331 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1332 SIMD_RegVariant size = elemType_to_regVariant(bt); 1333 sve_rev(ptmp, size, src); 1334 sve_brkb(ptmp, ptrue, ptmp, false); 1335 sve_cntp(dst, size, ptrue, ptmp); 1336 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1337 subw(dst, rscratch1, dst); 1338 } 1339 1340 // Extend integer vector src to dst with the same lane count 1341 // but larger element size, e.g. 4B -> 4I 1342 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1343 FloatRegister src, BasicType src_bt) { 1344 if (src_bt == T_BYTE) { 1345 if (dst_bt == T_SHORT) { 1346 // 4B/8B to 4S/8S 1347 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported"); 1348 sxtl(dst, T8H, src, T8B); 1349 } else { 1350 // 4B to 4I 1351 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1352 sxtl(dst, T8H, src, T8B); 1353 sxtl(dst, T4S, dst, T4H); 1354 } 1355 } else if (src_bt == T_SHORT) { 1356 // 4S to 4I 1357 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1358 sxtl(dst, T4S, src, T4H); 1359 } else if (src_bt == T_INT) { 1360 // 2I to 2L 1361 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1362 sxtl(dst, T2D, src, T2S); 1363 } else { 1364 ShouldNotReachHere(); 1365 } 1366 } 1367 1368 // Narrow integer vector src down to dst with the same lane count 1369 // but smaller element size, e.g. 
4I -> 4B 1370 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1371 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1372 if (src_bt == T_SHORT) { 1373 // 4S/8S to 4B/8B 1374 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1375 assert(dst_bt == T_BYTE, "unsupported"); 1376 xtn(dst, T8B, src, T8H); 1377 } else if (src_bt == T_INT) { 1378 // 4I to 4B/4S 1379 assert(src_vlen_in_bytes == 16, "unsupported"); 1380 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1381 xtn(dst, T4H, src, T4S); 1382 if (dst_bt == T_BYTE) { 1383 xtn(dst, T8B, dst, T8H); 1384 } 1385 } else if (src_bt == T_LONG) { 1386 // 2L to 2I 1387 assert(src_vlen_in_bytes == 16, "unsupported"); 1388 assert(dst_bt == T_INT, "unsupported"); 1389 xtn(dst, T2S, src, T2D); 1390 } else { 1391 ShouldNotReachHere(); 1392 } 1393 } 1394 1395 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1396 FloatRegister src, SIMD_RegVariant src_size) { 1397 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1398 if (src_size == B) { 1399 switch (dst_size) { 1400 case H: 1401 sve_sunpklo(dst, H, src); 1402 break; 1403 case S: 1404 sve_sunpklo(dst, H, src); 1405 sve_sunpklo(dst, S, dst); 1406 break; 1407 case D: 1408 sve_sunpklo(dst, H, src); 1409 sve_sunpklo(dst, S, dst); 1410 sve_sunpklo(dst, D, dst); 1411 break; 1412 default: 1413 ShouldNotReachHere(); 1414 } 1415 } else if (src_size == H) { 1416 if (dst_size == S) { 1417 sve_sunpklo(dst, S, src); 1418 } else { // D 1419 sve_sunpklo(dst, S, src); 1420 sve_sunpklo(dst, D, dst); 1421 } 1422 } else if (src_size == S) { 1423 sve_sunpklo(dst, D, src); 1424 } 1425 } 1426 1427 // Vector narrow from src to dst with specified element sizes. 1428 // High part of dst vector will be filled with zero. 1429 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1430 FloatRegister src, SIMD_RegVariant src_size, 1431 FloatRegister tmp) { 1432 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1433 assert_different_registers(src, tmp); 1434 sve_dup(tmp, src_size, 0); 1435 if (src_size == D) { 1436 switch (dst_size) { 1437 case S: 1438 sve_uzp1(dst, S, src, tmp); 1439 break; 1440 case H: 1441 assert_different_registers(dst, tmp); 1442 sve_uzp1(dst, S, src, tmp); 1443 sve_uzp1(dst, H, dst, tmp); 1444 break; 1445 case B: 1446 assert_different_registers(dst, tmp); 1447 sve_uzp1(dst, S, src, tmp); 1448 sve_uzp1(dst, H, dst, tmp); 1449 sve_uzp1(dst, B, dst, tmp); 1450 break; 1451 default: 1452 ShouldNotReachHere(); 1453 } 1454 } else if (src_size == S) { 1455 if (dst_size == H) { 1456 sve_uzp1(dst, H, src, tmp); 1457 } else { // B 1458 assert_different_registers(dst, tmp); 1459 sve_uzp1(dst, H, src, tmp); 1460 sve_uzp1(dst, B, dst, tmp); 1461 } 1462 } else if (src_size == H) { 1463 sve_uzp1(dst, B, src, tmp); 1464 } 1465 } 1466 1467 // Extend src predicate to dst predicate with the same lane count but larger 1468 // element size, e.g. 
64Byte -> 512Long 1469 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1470 uint dst_element_length_in_bytes, 1471 uint src_element_length_in_bytes) { 1472 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1473 sve_punpklo(dst, src); 1474 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1475 sve_punpklo(dst, src); 1476 sve_punpklo(dst, dst); 1477 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1478 sve_punpklo(dst, src); 1479 sve_punpklo(dst, dst); 1480 sve_punpklo(dst, dst); 1481 } else { 1482 assert(false, "unsupported"); 1483 ShouldNotReachHere(); 1484 } 1485 } 1486 1487 // Narrow src predicate to dst predicate with the same lane count but 1488 // smaller element size, e.g. 512Long -> 64Byte 1489 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1490 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1491 // The insignificant bits in src predicate are expected to be zero. 1492 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1493 // passed as the second argument. An example narrowing operation with a given mask would be - 1494 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1495 // Mask (for 2 Longs) : TF 1496 // Predicate register for the above mask (16 bits) : 00000001 00000000 1497 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1498 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1499 assert_different_registers(src, ptmp); 1500 assert_different_registers(dst, ptmp); 1501 sve_pfalse(ptmp); 1502 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1503 sve_uzp1(dst, B, src, ptmp); 1504 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1505 sve_uzp1(dst, H, src, ptmp); 1506 sve_uzp1(dst, B, dst, ptmp); 1507 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1508 sve_uzp1(dst, S, src, ptmp); 1509 sve_uzp1(dst, H, dst, ptmp); 1510 sve_uzp1(dst, B, dst, ptmp); 1511 } else { 1512 assert(false, "unsupported"); 1513 ShouldNotReachHere(); 1514 } 1515 } 1516 1517 // Vector reduction add for integral type with ASIMD instructions. 1518 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1519 Register isrc, FloatRegister vsrc, 1520 unsigned vector_length_in_bytes, 1521 FloatRegister vtmp) { 1522 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1523 assert_different_registers(dst, isrc); 1524 bool isQ = vector_length_in_bytes == 16; 1525 1526 BLOCK_COMMENT("neon_reduce_add_integral {"); 1527 switch(bt) { 1528 case T_BYTE: 1529 addv(vtmp, isQ ? T16B : T8B, vsrc); 1530 smov(dst, vtmp, B, 0); 1531 addw(dst, dst, isrc, ext::sxtb); 1532 break; 1533 case T_SHORT: 1534 addv(vtmp, isQ ? T8H : T4H, vsrc); 1535 smov(dst, vtmp, H, 0); 1536 addw(dst, dst, isrc, ext::sxth); 1537 break; 1538 case T_INT: 1539 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1540 umov(dst, vtmp, S, 0); 1541 addw(dst, dst, isrc); 1542 break; 1543 case T_LONG: 1544 assert(isQ, "unsupported"); 1545 addpd(vtmp, vsrc); 1546 umov(dst, vtmp, D, 0); 1547 add(dst, dst, isrc); 1548 break; 1549 default: 1550 assert(false, "unsupported"); 1551 ShouldNotReachHere(); 1552 } 1553 BLOCK_COMMENT("} neon_reduce_add_integral"); 1554 } 1555 1556 // Vector reduction multiply for integral type with ASIMD instructions. 
1557 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1558 // Clobbers: rscratch1 1559 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1560 Register isrc, FloatRegister vsrc, 1561 unsigned vector_length_in_bytes, 1562 FloatRegister vtmp1, FloatRegister vtmp2) { 1563 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1564 bool isQ = vector_length_in_bytes == 16; 1565 1566 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1567 switch(bt) { 1568 case T_BYTE: 1569 if (isQ) { 1570 // Multiply the lower half and higher half of vector iteratively. 1571 // vtmp1 = vsrc[8:15] 1572 ins(vtmp1, D, vsrc, 0, 1); 1573 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1574 mulv(vtmp1, T8B, vtmp1, vsrc); 1575 // vtmp2 = vtmp1[4:7] 1576 ins(vtmp2, S, vtmp1, 0, 1); 1577 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1578 mulv(vtmp1, T8B, vtmp2, vtmp1); 1579 } else { 1580 ins(vtmp1, S, vsrc, 0, 1); 1581 mulv(vtmp1, T8B, vtmp1, vsrc); 1582 } 1583 // vtmp2 = vtmp1[2:3] 1584 ins(vtmp2, H, vtmp1, 0, 1); 1585 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1586 mulv(vtmp2, T8B, vtmp2, vtmp1); 1587 // dst = vtmp2[0] * isrc * vtmp2[1] 1588 umov(rscratch1, vtmp2, B, 0); 1589 mulw(dst, rscratch1, isrc); 1590 sxtb(dst, dst); 1591 umov(rscratch1, vtmp2, B, 1); 1592 mulw(dst, rscratch1, dst); 1593 sxtb(dst, dst); 1594 break; 1595 case T_SHORT: 1596 if (isQ) { 1597 ins(vtmp2, D, vsrc, 0, 1); 1598 mulv(vtmp2, T4H, vtmp2, vsrc); 1599 ins(vtmp1, S, vtmp2, 0, 1); 1600 mulv(vtmp1, T4H, vtmp1, vtmp2); 1601 } else { 1602 ins(vtmp1, S, vsrc, 0, 1); 1603 mulv(vtmp1, T4H, vtmp1, vsrc); 1604 } 1605 umov(rscratch1, vtmp1, H, 0); 1606 mulw(dst, rscratch1, isrc); 1607 sxth(dst, dst); 1608 umov(rscratch1, vtmp1, H, 1); 1609 mulw(dst, rscratch1, dst); 1610 sxth(dst, dst); 1611 break; 1612 case T_INT: 1613 if (isQ) { 1614 ins(vtmp1, D, vsrc, 0, 1); 1615 mulv(vtmp1, T2S, vtmp1, vsrc); 1616 } else { 1617 vtmp1 = vsrc; 1618 } 1619 umov(rscratch1, vtmp1, S, 0); 1620 mul(dst, rscratch1, isrc); 1621 umov(rscratch1, vtmp1, S, 1); 1622 mul(dst, rscratch1, dst); 1623 break; 1624 case T_LONG: 1625 umov(rscratch1, vsrc, D, 0); 1626 mul(dst, isrc, rscratch1); 1627 umov(rscratch1, vsrc, D, 1); 1628 mul(dst, dst, rscratch1); 1629 break; 1630 default: 1631 assert(false, "unsupported"); 1632 ShouldNotReachHere(); 1633 } 1634 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1635 } 1636 1637 // Vector reduction multiply for floating-point type with ASIMD instructions. 
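// Scalar reference for the reduction below (an illustrative sketch only):
//
//   float reduce_mul_float(float fsrc, const float* v, int lanes /* 2 or 4 */) {
//     float r = fsrc;
//     for (int i = 0; i < lanes; i++) {
//       r *= v[i];   // strictly in lane order, as FP multiplication is not associative
//     }
//     return r;
//   }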
1638 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1639 FloatRegister fsrc, FloatRegister vsrc, 1640 unsigned vector_length_in_bytes, 1641 FloatRegister vtmp) { 1642 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1643 bool isQ = vector_length_in_bytes == 16; 1644 1645 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1646 switch(bt) { 1647 case T_FLOAT: 1648 fmuls(dst, fsrc, vsrc); 1649 ins(vtmp, S, vsrc, 0, 1); 1650 fmuls(dst, dst, vtmp); 1651 if (isQ) { 1652 ins(vtmp, S, vsrc, 0, 2); 1653 fmuls(dst, dst, vtmp); 1654 ins(vtmp, S, vsrc, 0, 3); 1655 fmuls(dst, dst, vtmp); 1656 } 1657 break; 1658 case T_DOUBLE: 1659 assert(isQ, "unsupported"); 1660 fmuld(dst, fsrc, vsrc); 1661 ins(vtmp, D, vsrc, 0, 1); 1662 fmuld(dst, dst, vtmp); 1663 break; 1664 default: 1665 assert(false, "unsupported"); 1666 ShouldNotReachHere(); 1667 } 1668 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1669 } 1670 1671 // Helper to select logical instruction 1672 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1673 Register Rn, Register Rm, 1674 enum shift_kind kind, unsigned shift) { 1675 switch(opc) { 1676 case Op_AndReductionV: 1677 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1678 break; 1679 case Op_OrReductionV: 1680 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1681 break; 1682 case Op_XorReductionV: 1683 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1684 break; 1685 default: 1686 assert(false, "unsupported"); 1687 ShouldNotReachHere(); 1688 } 1689 } 1690 1691 // Vector reduction logical operations And, Or, Xor 1692 // Clobbers: rscratch1 1693 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1694 Register isrc, FloatRegister vsrc, 1695 unsigned vector_length_in_bytes) { 1696 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1697 "unsupported"); 1698 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1699 assert_different_registers(dst, isrc); 1700 bool isQ = vector_length_in_bytes == 16; 1701 1702 BLOCK_COMMENT("neon_reduce_logical {"); 1703 umov(rscratch1, vsrc, isQ ? D : S, 0); 1704 umov(dst, vsrc, isQ ? 

// Helper to select logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, is_min ? LT : GT);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, is_min ? LT : GT);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
    } else {
      is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
    }
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, is_min ? LT : GT);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
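
// Editorial note: an illustrative scalar sketch (not part of the VM sources) of the
// folding trick used by neon_reduce_logical above. After both 64-bit halves of the
// vector have been moved to general-purpose registers and combined, repeatedly
// combining the register with itself shifted right by half of the remaining width
// collapses all lanes into the lowest element. For the XOR/byte case:
//
//   static jbyte xor_reduce_bytes_ref(uint64_t v) {  // v holds eight byte lanes
//     v ^= v >> 32;
//     v ^= v >> 16;
//     v ^= v >> 8;
//     return (jbyte)v;  // the low byte now holds the XOR of all eight lanes
//   }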

// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
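
// Editorial note: a minimal scalar sketch (not part of the VM sources) of the min/max
// reductions generated above (neon_reduce_minmax_integral and the Min/Max cases of
// sve_reduce_integral): the vector is reduced to a single lane value, which is then
// combined with the incoming scalar. The helper name and array form are hypothetical.
//
//   static jint min_reduce_ref(jint isrc, const jint* lanes, int lane_cnt) {
//     jint acc = isrc;
//     for (int i = 0; i < lane_cnt; i++) {
//       acc = MIN2(acc, lanes[i]);  // the Max reduction uses MAX2 instead
//     }
//     return acc;
//   }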

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all elements to true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src  = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst  = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining elements with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high (after shifting) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src  = 88 77 66 55 44 33 22 11
  //                  mask = 01 00 00 01 01 00 01 01
  // Expected result: dst  = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining elements with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat to the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:  dst   = 00 00 00 00 00 44 22 11
  // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high (after shifting) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}
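
// Editorial note: a minimal scalar sketch (not part of the VM sources) of the selection
// performed by vector_signum_neon/vector_signum_sve above: lanes that are +-0.0 or NaN
// keep the source value, every other lane becomes +-1.0 with the sign of the source.
// The helper name is hypothetical.
//
//   static float signum_ref(float x) {
//     if (x != x || x == 0.0f) {  // NaN or +-0.0: return the argument unchanged
//       return x;
//     }
//     return x > 0.0f ? 1.0f : -1.0f;
//   }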