1 /* 2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 36 #ifdef PRODUCT 37 #define BLOCK_COMMENT(str) /* nothing */ 38 #define STOP(error) stop(error) 39 #else 40 #define BLOCK_COMMENT(str) block_comment(str) 41 #define STOP(error) block_comment(error); stop(error) 42 #endif 43 44 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 45 46 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 47 48 void C2_MacroAssembler::entry_barrier() { 49 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 50 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 51 // Dummy labels for just measuring the code size 52 Label dummy_slow_path; 53 Label dummy_continuation; 54 Label dummy_guard; 55 Label* slow_path = &dummy_slow_path; 56 Label* continuation = &dummy_continuation; 57 Label* guard = &dummy_guard; 58 if (!Compile::current()->output()->in_scratch_emit_size()) { 59 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 60 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 61 Compile::current()->output()->add_stub(stub); 62 slow_path = &stub->entry(); 63 continuation = &stub->continuation(); 64 guard = &stub->guard(); 65 } 66 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub. 67 bs->nmethod_entry_barrier(this, slow_path, continuation, guard); 68 } 69 } 70 71 // Search for str1 in str2 and return index or -1 72 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 
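// As a reference only (a sketch, not the emitted instruction sequence), the
// generated code computes the equivalent of the following scalar search, where
// 'chr' is an illustrative placeholder for the byte or char element type
// selected by 'ae', and icnt1 is the pattern length when it is a compile-time
// constant, or -1 otherwise:
//
//   int string_indexof(chr* str2, int cnt2, chr* str1, int cnt1) {
//     for (int i = 0; i + cnt1 <= cnt2; i++) {
//       int j = 0;
//       while (j < cnt1 && str2[i + j] == str1[j]) j++;
//       if (j == cnt1) return i;   // first occurrence of the pattern
//     }
//     return -1;                   // not found
//   }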
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use a linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
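  //
  // As a worked example (illustrative only): for the Latin1 pattern "NEEDLE"
  // (m = 6), the preprocessing loop in the pseudocode below produces
  //
  //   bc['N'] = 5, bc['E'] = 3, bc['D'] = 2, bc['L'] = 1, bc[any other] = 6
  //
  // so a window whose last source character does not occur among the first
  // m-1 pattern characters can be shifted forward by a whole pattern length.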
144 // 145 // #define ASIZE 256 146 // 147 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 148 // int i, j; 149 // unsigned c; 150 // unsigned char bc[ASIZE]; 151 // 152 // /* Preprocessing */ 153 // for (i = 0; i < ASIZE; ++i) 154 // bc[i] = m; 155 // for (i = 0; i < m - 1; ) { 156 // c = x[i]; 157 // ++i; 158 // // c < 256 for Latin1 string, so, no need for branch 159 // #ifdef PATTERN_STRING_IS_LATIN1 160 // bc[c] = m - i; 161 // #else 162 // if (c < ASIZE) bc[c] = m - i; 163 // #endif 164 // } 165 // 166 // /* Searching */ 167 // j = 0; 168 // while (j <= n - m) { 169 // c = y[i+j]; 170 // if (x[m-1] == c) 171 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 172 // if (i < 0) return j; 173 // // c < 256 for Latin1 string, so, no need for branch 174 // #ifdef SOURCE_STRING_IS_LATIN1 175 // // LL case: (c< 256) always true. Remove branch 176 // j += bc[y[j+m-1]]; 177 // #endif 178 // #ifndef PATTERN_STRING_IS_UTF 179 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 180 // if (c < ASIZE) 181 // j += bc[y[j+m-1]]; 182 // else 183 // j += 1 184 // #endif 185 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 186 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 187 // if (c < ASIZE) 188 // j += bc[y[j+m-1]]; 189 // else 190 // j += m 191 // #endif 192 // } 193 // } 194 195 if (icnt1 == -1) { 196 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 197 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 198 Register cnt1end = tmp2; 199 Register str2end = cnt2; 200 Register skipch = tmp2; 201 202 // str1 length is >=8, so, we can read at least 1 register for cases when 203 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 204 // UL case. We'll re-read last character in inner pre-loop code to have 205 // single outer pre-loop load 206 const int firstStep = isL ? 7 : 3; 207 208 const int ASIZE = 256; 209 const int STORED_BYTES = 32; // amount of bytes stored per instruction 210 sub(sp, sp, ASIZE); 211 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 212 mov(ch1, sp); 213 BIND(BM_INIT_LOOP); 214 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 215 subs(tmp5, tmp5, 1); 216 br(GT, BM_INIT_LOOP); 217 218 sub(cnt1tmp, cnt1, 1); 219 mov(tmp5, str2); 220 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 221 sub(ch2, cnt1, 1); 222 mov(tmp3, str1); 223 BIND(BCLOOP); 224 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 225 if (!str1_isL) { 226 subs(zr, ch1, ASIZE); 227 br(HS, BCSKIP); 228 } 229 strb(ch2, Address(sp, ch1)); 230 BIND(BCSKIP); 231 subs(ch2, ch2, 1); 232 br(GT, BCLOOP); 233 234 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 235 if (str1_isL == str2_isL) { 236 // load last 8 bytes (8LL/4UU symbols) 237 ldr(tmp6, Address(tmp6, -wordSize)); 238 } else { 239 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 240 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 241 // it's still faster than per-character loads+checks 242 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 243 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 244 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 245 andr(tmp6, tmp6, 0xFF); // str1[N-4] 246 orr(ch2, ch1, ch2, LSL, 16); 247 orr(tmp6, tmp6, tmp3, LSL, 48); 248 orr(tmp6, tmp6, ch2, LSL, 16); 249 } 250 BIND(BMLOOPSTR2); 251 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 252 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 253 if (str1_isL == str2_isL) { 254 // re-init tmp3. It's for free because it's executed in parallel with 255 // load above. Alternative is to initialize it before loop, but it'll 256 // affect performance on in-order systems with 2 or more ld/st pipelines 257 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 258 } 259 if (!isL) { // UU/UL case 260 lsl(ch2, cnt1tmp, 1); // offset in bytes 261 } 262 cmp(tmp3, skipch); 263 br(NE, BMSKIP); 264 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 265 mov(ch1, tmp6); 266 if (isL) { 267 b(BMLOOPSTR1_AFTER_LOAD); 268 } else { 269 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 270 b(BMLOOPSTR1_CMP); 271 } 272 BIND(BMLOOPSTR1); 273 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 274 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 275 BIND(BMLOOPSTR1_AFTER_LOAD); 276 subs(cnt1tmp, cnt1tmp, 1); 277 br(LT, BMLOOPSTR1_LASTCMP); 278 BIND(BMLOOPSTR1_CMP); 279 cmp(ch1, ch2); 280 br(EQ, BMLOOPSTR1); 281 BIND(BMSKIP); 282 if (!isL) { 283 // if we've met UTF symbol while searching Latin1 pattern, then we can 284 // skip cnt1 symbols 285 if (str1_isL != str2_isL) { 286 mov(result_tmp, cnt1); 287 } else { 288 mov(result_tmp, 1); 289 } 290 subs(zr, skipch, ASIZE); 291 br(HS, BMADV); 292 } 293 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 294 BIND(BMADV); 295 sub(cnt1tmp, cnt1, 1); 296 add(str2, str2, result_tmp, LSL, str2_chr_shift); 297 cmp(str2, str2end); 298 br(LE, BMLOOPSTR2); 299 add(sp, sp, ASIZE); 300 b(NOMATCH); 301 BIND(BMLOOPSTR1_LASTCMP); 302 cmp(ch1, ch2); 303 br(NE, BMSKIP); 304 BIND(BMMATCH); 305 sub(result, str2, tmp5); 306 if (!str2_isL) lsr(result, result, 1); 307 add(sp, sp, ASIZE); 308 b(DONE); 309 310 BIND(LINEARSTUB); 311 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 312 br(LT, LINEAR_MEDIUM); 313 mov(result, zr); 314 RuntimeAddress stub = nullptr; 315 if (isL) { 316 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 317 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 318 } else if (str1_isL) { 319 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 320 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 321 } else { 322 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 323 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 324 } 325 address call = trampoline_call(stub); 326 if (call == nullptr) { 327 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 328 ciEnv::current()->record_failure("CodeCache is full"); 329 return; 330 } 331 b(DONE); 332 } 333 334 BIND(LINEARSEARCH); 335 { 336 Label DO1, DO2, DO3; 337 338 Register str2tmp = tmp2; 339 Register first = tmp3; 340 341 if (icnt1 == 
-1) 342 { 343 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 344 345 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 346 br(LT, DOSHORT); 347 BIND(LINEAR_MEDIUM); 348 (this->*str1_load_1chr)(first, Address(str1)); 349 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 350 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 351 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 352 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 353 354 BIND(FIRST_LOOP); 355 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 356 cmp(first, ch2); 357 br(EQ, STR1_LOOP); 358 BIND(STR2_NEXT); 359 adds(cnt2_neg, cnt2_neg, str2_chr_size); 360 br(LE, FIRST_LOOP); 361 b(NOMATCH); 362 363 BIND(STR1_LOOP); 364 adds(cnt1tmp, cnt1_neg, str1_chr_size); 365 add(cnt2tmp, cnt2_neg, str2_chr_size); 366 br(GE, MATCH); 367 368 BIND(STR1_NEXT); 369 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 370 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 371 cmp(ch1, ch2); 372 br(NE, STR2_NEXT); 373 adds(cnt1tmp, cnt1tmp, str1_chr_size); 374 add(cnt2tmp, cnt2tmp, str2_chr_size); 375 br(LT, STR1_NEXT); 376 b(MATCH); 377 378 BIND(DOSHORT); 379 if (str1_isL == str2_isL) { 380 cmp(cnt1, (u1)2); 381 br(LT, DO1); 382 br(GT, DO3); 383 } 384 } 385 386 if (icnt1 == 4) { 387 Label CH1_LOOP; 388 389 (this->*load_4chr)(ch1, str1); 390 sub(result_tmp, cnt2, 4); 391 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 392 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 393 394 BIND(CH1_LOOP); 395 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 396 cmp(ch1, ch2); 397 br(EQ, MATCH); 398 adds(cnt2_neg, cnt2_neg, str2_chr_size); 399 br(LE, CH1_LOOP); 400 b(NOMATCH); 401 } 402 403 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 404 Label CH1_LOOP; 405 406 BIND(DO2); 407 (this->*load_2chr)(ch1, str1); 408 if (icnt1 == 2) { 409 sub(result_tmp, cnt2, 2); 410 } 411 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 412 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 413 BIND(CH1_LOOP); 414 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 415 cmp(ch1, ch2); 416 br(EQ, MATCH); 417 adds(cnt2_neg, cnt2_neg, str2_chr_size); 418 br(LE, CH1_LOOP); 419 b(NOMATCH); 420 } 421 422 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 423 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 424 425 BIND(DO3); 426 (this->*load_2chr)(first, str1); 427 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 428 if (icnt1 == 3) { 429 sub(result_tmp, cnt2, 3); 430 } 431 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 432 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 433 BIND(FIRST_LOOP); 434 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 435 cmpw(first, ch2); 436 br(EQ, STR1_LOOP); 437 BIND(STR2_NEXT); 438 adds(cnt2_neg, cnt2_neg, str2_chr_size); 439 br(LE, FIRST_LOOP); 440 b(NOMATCH); 441 442 BIND(STR1_LOOP); 443 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 444 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 445 cmp(ch1, ch2); 446 br(NE, STR2_NEXT); 447 b(MATCH); 448 } 449 450 if (icnt1 == -1 || icnt1 == 1) { 451 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 452 453 BIND(DO1); 454 (this->*str1_load_1chr)(ch1, str1); 455 cmp(cnt2, (u1)8); 456 br(LT, DO1_SHORT); 457 458 sub(result_tmp, cnt2, 8/str2_chr_size); 459 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 460 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 461 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 462 463 if (str2_isL) { 464 orr(ch1, ch1, ch1, LSL, 8); 465 } 466 orr(ch1, ch1, ch1, LSL, 16); 467 orr(ch1, ch1, ch1, LSL, 32); 468 BIND(CH1_LOOP); 469 ldr(ch2, Address(str2, cnt2_neg)); 470 eor(ch2, ch1, ch2); 471 sub(tmp1, ch2, tmp3); 472 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 473 bics(tmp1, tmp1, tmp2); 474 br(NE, HAS_ZERO); 475 adds(cnt2_neg, cnt2_neg, 8); 476 br(LT, CH1_LOOP); 477 478 cmp(cnt2_neg, (u1)8); 479 mov(cnt2_neg, 0); 480 br(LT, CH1_LOOP); 481 b(NOMATCH); 482 483 BIND(HAS_ZERO); 484 rev(tmp1, tmp1); 485 clz(tmp1, tmp1); 486 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 487 b(MATCH); 488 489 BIND(DO1_SHORT); 490 mov(result_tmp, cnt2); 491 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 492 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 493 BIND(DO1_LOOP); 494 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 495 cmpw(ch1, ch2); 496 br(EQ, MATCH); 497 adds(cnt2_neg, cnt2_neg, str2_chr_size); 498 br(LT, DO1_LOOP); 499 } 500 } 501 BIND(NOMATCH); 502 mov(result, -1); 503 b(DONE); 504 BIND(MATCH); 505 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 506 BIND(DONE); 507 } 508 509 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 510 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 511 512 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 513 Register ch, Register result, 514 Register tmp1, Register tmp2, Register tmp3) 515 { 516 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 517 Register cnt1_neg = cnt1; 518 Register ch1 = rscratch1; 519 Register result_tmp = rscratch2; 520 521 cbz(cnt1, NOMATCH); 522 523 cmp(cnt1, (u1)4); 524 br(LT, DO1_SHORT); 525 526 orr(ch, ch, ch, LSL, 16); 527 orr(ch, ch, ch, LSL, 32); 528 529 sub(cnt1, cnt1, 4); 530 mov(result_tmp, cnt1); 531 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 532 sub(cnt1_neg, zr, cnt1, LSL, 1); 533 534 mov(tmp3, 0x0001000100010001); 535 536 BIND(CH1_LOOP); 537 ldr(ch1, Address(str1, cnt1_neg)); 538 eor(ch1, ch, ch1); 539 sub(tmp1, ch1, tmp3); 540 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 541 bics(tmp1, tmp1, tmp2); 542 br(NE, HAS_ZERO); 543 adds(cnt1_neg, cnt1_neg, 8); 544 br(LT, CH1_LOOP); 545 546 cmp(cnt1_neg, (u1)8); 547 mov(cnt1_neg, 0); 548 br(LT, CH1_LOOP); 549 b(NOMATCH); 550 551 BIND(HAS_ZERO); 552 rev(tmp1, tmp1); 553 clz(tmp1, tmp1); 554 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 555 b(MATCH); 556 557 BIND(DO1_SHORT); 558 mov(result_tmp, cnt1); 559 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 560 sub(cnt1_neg, zr, cnt1, LSL, 1); 561 BIND(DO1_LOOP); 562 ldrh(ch1, Address(str1, cnt1_neg)); 563 cmpw(ch, ch1); 564 br(EQ, MATCH); 565 adds(cnt1_neg, cnt1_neg, 2); 566 br(LT, DO1_LOOP); 567 BIND(NOMATCH); 568 mov(result, -1); 569 b(DONE); 570 BIND(MATCH); 571 add(result, result_tmp, cnt1_neg, ASR, 1); 572 BIND(DONE); 573 } 574 575 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 576 Register ch, Register result, 577 FloatRegister ztmp1, 578 FloatRegister ztmp2, 579 PRegister tmp_pg, 580 PRegister tmp_pdn, bool isL) 581 { 582 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 583 assert(tmp_pg->is_governing(), 584 "this register has to be a governing predicate register"); 585 586 Label LOOP, MATCH, DONE, NOMATCH; 587 Register vec_len = rscratch1; 588 Register idx = rscratch2; 589 590 SIMD_RegVariant T = (isL == true) ? 
B : H; 591 592 cbz(cnt1, NOMATCH); 593 594 // Assign the particular char throughout the vector. 595 sve_dup(ztmp2, T, ch); 596 if (isL) { 597 sve_cntb(vec_len); 598 } else { 599 sve_cnth(vec_len); 600 } 601 mov(idx, 0); 602 603 // Generate a predicate to control the reading of input string. 604 sve_whilelt(tmp_pg, T, idx, cnt1); 605 606 BIND(LOOP); 607 // Read a vector of 8- or 16-bit data depending on the string type. Note 608 // that inactive elements indicated by the predicate register won't cause 609 // a data read from memory to the destination vector. 610 if (isL) { 611 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 612 } else { 613 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 614 } 615 add(idx, idx, vec_len); 616 617 // Perform the comparison. An element of the destination predicate is set 618 // to active if the particular char is matched. 619 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 620 621 // Branch if the particular char is found. 622 br(NE, MATCH); 623 624 sve_whilelt(tmp_pg, T, idx, cnt1); 625 626 // Loop back if the particular char not found. 627 br(MI, LOOP); 628 629 BIND(NOMATCH); 630 mov(result, -1); 631 b(DONE); 632 633 BIND(MATCH); 634 // Undo the index increment. 635 sub(idx, idx, vec_len); 636 637 // Crop the vector to find its location. 638 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 639 add(result, idx, -1); 640 sve_incp(result, T, tmp_pdn); 641 BIND(DONE); 642 } 643 644 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 645 Register ch, Register result, 646 Register tmp1, Register tmp2, Register tmp3) 647 { 648 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 649 Register cnt1_neg = cnt1; 650 Register ch1 = rscratch1; 651 Register result_tmp = rscratch2; 652 653 cbz(cnt1, NOMATCH); 654 655 cmp(cnt1, (u1)8); 656 br(LT, DO1_SHORT); 657 658 orr(ch, ch, ch, LSL, 8); 659 orr(ch, ch, ch, LSL, 16); 660 orr(ch, ch, ch, LSL, 32); 661 662 sub(cnt1, cnt1, 8); 663 mov(result_tmp, cnt1); 664 lea(str1, Address(str1, cnt1)); 665 sub(cnt1_neg, zr, cnt1); 666 667 mov(tmp3, 0x0101010101010101); 668 669 BIND(CH1_LOOP); 670 ldr(ch1, Address(str1, cnt1_neg)); 671 eor(ch1, ch, ch1); 672 sub(tmp1, ch1, tmp3); 673 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 674 bics(tmp1, tmp1, tmp2); 675 br(NE, HAS_ZERO); 676 adds(cnt1_neg, cnt1_neg, 8); 677 br(LT, CH1_LOOP); 678 679 cmp(cnt1_neg, (u1)8); 680 mov(cnt1_neg, 0); 681 br(LT, CH1_LOOP); 682 b(NOMATCH); 683 684 BIND(HAS_ZERO); 685 rev(tmp1, tmp1); 686 clz(tmp1, tmp1); 687 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 688 b(MATCH); 689 690 BIND(DO1_SHORT); 691 mov(result_tmp, cnt1); 692 lea(str1, Address(str1, cnt1)); 693 sub(cnt1_neg, zr, cnt1); 694 BIND(DO1_LOOP); 695 ldrb(ch1, Address(str1, cnt1_neg)); 696 cmp(ch, ch1); 697 br(EQ, MATCH); 698 adds(cnt1_neg, cnt1_neg, 1); 699 br(LT, DO1_LOOP); 700 BIND(NOMATCH); 701 mov(result, -1); 702 b(DONE); 703 BIND(MATCH); 704 add(result, result_tmp, cnt1_neg); 705 BIND(DONE); 706 } 707 708 // Compare strings. 
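// As a reference only (a sketch of the comparison being implemented; 'chr1'
// and 'chr2' are illustrative placeholders for the element types selected by
// 'ae'):
//
//   int string_compare(chr1* str1, int cnt1, chr2* str2, int cnt2) {
//     int len = cnt1 < cnt2 ? cnt1 : cnt2;
//     for (int i = 0; i < len; i++) {
//       if (str1[i] != str2[i]) return str1[i] - str2[i];
//     }
//     return cnt1 - cnt2;
//   }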
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
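  // The subtraction below leaves cnt1 - cnt2 in result (the value returned
  // when the shorter string is a prefix of the longer one) and sets the flags
  // that the following cselw uses to keep min(cnt1, cnt2) in cnt2.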
751 subsw(result, cnt1, cnt2); 752 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 753 754 // A very short string 755 cmpw(cnt2, minCharsInWord); 756 br(Assembler::LE, SHORT_STRING); 757 758 // Compare longwords 759 // load first parts of strings and finish initialization while loading 760 { 761 if (str1_isL == str2_isL) { // LL or UU 762 ldr(tmp1, Address(str1)); 763 cmp(str1, str2); 764 br(Assembler::EQ, DONE); 765 ldr(tmp2, Address(str2)); 766 cmp(cnt2, stub_threshold); 767 br(GE, STUB); 768 subsw(cnt2, cnt2, minCharsInWord); 769 br(EQ, TAIL_CHECK); 770 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 771 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 772 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 773 } else if (isLU) { 774 ldrs(vtmp, Address(str1)); 775 ldr(tmp2, Address(str2)); 776 cmp(cnt2, stub_threshold); 777 br(GE, STUB); 778 subw(cnt2, cnt2, 4); 779 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 780 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 781 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 782 zip1(vtmp, T8B, vtmp, vtmpZ); 783 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 784 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 785 add(cnt1, cnt1, 4); 786 fmovd(tmp1, vtmp); 787 } else { // UL case 788 ldr(tmp1, Address(str1)); 789 ldrs(vtmp, Address(str2)); 790 cmp(cnt2, stub_threshold); 791 br(GE, STUB); 792 subw(cnt2, cnt2, 4); 793 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 794 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 795 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 796 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 797 zip1(vtmp, T8B, vtmp, vtmpZ); 798 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 799 add(cnt1, cnt1, 8); 800 fmovd(tmp2, vtmp); 801 } 802 adds(cnt2, cnt2, isUL ? 4 : 8); 803 br(GE, TAIL); 804 eor(rscratch2, tmp1, tmp2); 805 cbnz(rscratch2, DIFF); 806 // main loop 807 bind(NEXT_WORD); 808 if (str1_isL == str2_isL) { 809 ldr(tmp1, Address(str1, cnt2)); 810 ldr(tmp2, Address(str2, cnt2)); 811 adds(cnt2, cnt2, 8); 812 } else if (isLU) { 813 ldrs(vtmp, Address(str1, cnt1)); 814 ldr(tmp2, Address(str2, cnt2)); 815 add(cnt1, cnt1, 4); 816 zip1(vtmp, T8B, vtmp, vtmpZ); 817 fmovd(tmp1, vtmp); 818 adds(cnt2, cnt2, 8); 819 } else { // UL 820 ldrs(vtmp, Address(str2, cnt2)); 821 ldr(tmp1, Address(str1, cnt1)); 822 zip1(vtmp, T8B, vtmp, vtmpZ); 823 add(cnt1, cnt1, 8); 824 fmovd(tmp2, vtmp); 825 adds(cnt2, cnt2, 4); 826 } 827 br(GE, TAIL); 828 829 eor(rscratch2, tmp1, tmp2); 830 cbz(rscratch2, NEXT_WORD); 831 b(DIFF); 832 bind(TAIL); 833 eor(rscratch2, tmp1, tmp2); 834 cbnz(rscratch2, DIFF); 835 // Last longword. In the case where length == 4 we compare the 836 // same longword twice, but that's still faster than another 837 // conditional branch. 838 if (str1_isL == str2_isL) { 839 ldr(tmp1, Address(str1)); 840 ldr(tmp2, Address(str2)); 841 } else if (isLU) { 842 ldrs(vtmp, Address(str1)); 843 ldr(tmp2, Address(str2)); 844 zip1(vtmp, T8B, vtmp, vtmpZ); 845 fmovd(tmp1, vtmp); 846 } else { // UL 847 ldrs(vtmp, Address(str2)); 848 ldr(tmp1, Address(str1)); 849 zip1(vtmp, T8B, vtmp, vtmpZ); 850 fmovd(tmp2, vtmp); 851 } 852 bind(TAIL_CHECK); 853 eor(rscratch2, tmp1, tmp2); 854 cbz(rscratch2, DONE); 855 856 // Find the first different characters in the longwords and 857 // compute their difference. 858 bind(DIFF); 859 rev(rscratch2, rscratch2); 860 clz(rscratch2, rscratch2); 861 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 862 lsrv(tmp1, tmp1, rscratch2); 863 (this->*ext_chr)(tmp1, tmp1); 864 lsrv(tmp2, tmp2, rscratch2); 865 (this->*ext_chr)(tmp2, tmp2); 866 subw(result, tmp1, tmp2); 867 b(DONE); 868 } 869 870 bind(STUB); 871 RuntimeAddress stub = nullptr; 872 switch(ae) { 873 case StrIntrinsicNode::LL: 874 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 875 break; 876 case StrIntrinsicNode::UU: 877 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 878 break; 879 case StrIntrinsicNode::LU: 880 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 881 break; 882 case StrIntrinsicNode::UL: 883 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 884 break; 885 default: 886 ShouldNotReachHere(); 887 } 888 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 889 address call = trampoline_call(stub); 890 if (call == nullptr) { 891 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 892 ciEnv::current()->record_failure("CodeCache is full"); 893 return; 894 } 895 b(DONE); 896 897 bind(SHORT_STRING); 898 // Is the minimum length zero? 899 cbz(cnt2, DONE); 900 // arrange code to do most branches while loading and loading next characters 901 // while comparing previous 902 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 903 subs(cnt2, cnt2, 1); 904 br(EQ, SHORT_LAST_INIT); 905 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 906 b(SHORT_LOOP_START); 907 bind(SHORT_LOOP); 908 subs(cnt2, cnt2, 1); 909 br(EQ, SHORT_LAST); 910 bind(SHORT_LOOP_START); 911 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 912 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 913 cmp(tmp1, cnt1); 914 br(NE, SHORT_LOOP_TAIL); 915 subs(cnt2, cnt2, 1); 916 br(EQ, SHORT_LAST2); 917 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 918 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 919 cmp(tmp2, rscratch1); 920 br(EQ, SHORT_LOOP); 921 sub(result, tmp2, rscratch1); 922 b(DONE); 923 bind(SHORT_LOOP_TAIL); 924 sub(result, tmp1, cnt1); 925 b(DONE); 926 bind(SHORT_LAST2); 927 cmp(tmp2, rscratch1); 928 br(EQ, DONE); 929 sub(result, tmp2, rscratch1); 930 931 b(DONE); 932 bind(SHORT_LAST_INIT); 933 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 934 bind(SHORT_LAST); 935 cmp(tmp1, cnt1); 936 br(EQ, DONE); 937 sub(result, tmp1, cnt1); 938 939 bind(DONE); 940 941 BLOCK_COMMENT("} string_compare"); 942 } 943 944 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 945 FloatRegister src2, Condition cond, bool isQ) { 946 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 947 FloatRegister zn = src1, zm = src2; 948 bool needs_negation = false; 949 switch (cond) { 950 case LT: cond = GT; zn = src2; zm = src1; break; 951 case LE: cond = GE; zn = src2; zm = src1; break; 952 case LO: cond = HI; zn = src2; zm = src1; break; 953 case LS: cond = HS; zn = src2; zm = src1; break; 954 case NE: cond = EQ; needs_negation = true; break; 955 default: 956 break; 957 } 958 959 if (is_floating_point_type(bt)) { 960 fcm(cond, dst, size, zn, zm); 961 } else { 962 cm(cond, dst, size, zn, zm); 963 } 964 965 if (needs_negation) { 966 notr(dst, isQ ? 
T16B : T8B, dst); 967 } 968 } 969 970 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 971 Condition cond, bool isQ) { 972 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 973 if (bt == T_FLOAT || bt == T_DOUBLE) { 974 if (cond == Assembler::NE) { 975 fcm(Assembler::EQ, dst, size, src); 976 notr(dst, isQ ? T16B : T8B, dst); 977 } else { 978 fcm(cond, dst, size, src); 979 } 980 } else { 981 if (cond == Assembler::NE) { 982 cm(Assembler::EQ, dst, size, src); 983 notr(dst, isQ ? T16B : T8B, dst); 984 } else { 985 cm(cond, dst, size, src); 986 } 987 } 988 } 989 990 // Compress the least significant bit of each byte to the rightmost and clear 991 // the higher garbage bits. 992 void C2_MacroAssembler::bytemask_compress(Register dst) { 993 // Example input, dst = 0x01 00 00 00 01 01 00 01 994 // The "??" bytes are garbage. 995 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 996 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 997 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 998 andr(dst, dst, 0xff); // dst = 0x8D 999 } 1000 1001 // Pack the lowest-numbered bit of each mask element in src into a long value 1002 // in dst, at most the first 64 lane elements. 1003 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1004 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1005 FloatRegister vtmp1, FloatRegister vtmp2) { 1006 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1007 assert_different_registers(dst, rscratch1); 1008 assert_different_registers(vtmp1, vtmp2); 1009 1010 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1011 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1012 // Expected: dst = 0x658D 1013 1014 // Convert the mask into vector with sequential bytes. 1015 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1016 sve_cpy(vtmp1, size, src, 1, false); 1017 if (bt != T_BYTE) { 1018 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1019 } 1020 1021 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1022 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1023 // is to compress each significant bit of the byte in a cross-lane way. Due 1024 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1025 // (bit-compress in each lane) with the biggest lane size (T = D) then 1026 // concatenate the results. 1027 1028 // The second source input of BEXT, initialized with 0x01 in each byte. 1029 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1030 sve_dup(vtmp2, B, 1); 1031 1032 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1033 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1034 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1035 // --------------------------------------- 1036 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1037 sve_bext(vtmp1, D, vtmp1, vtmp2); 1038 1039 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1040 // result to dst. 1041 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1042 // dst = 0x658D 1043 if (lane_cnt <= 8) { 1044 // No need to concatenate. 
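      // With at most 8 lanes the compressed bits already fit in byte 0 of
      // vtmp1, so they can be moved straight to dst.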
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register for different types,
// with a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g. 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put the long value from the general purpose register into the first lane of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum granularity of one byte, the
  // bit mask in the first lane has to be turned into a byte mask, which can be
  // done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Place each mask byte in its
  // own 8-byte lane.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
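  // (BDEP scatters the low-order bits of each first-operand element into the
  // bit positions selected by the set bits of the second operand, so with 0x01
  // in every byte one mask bit is deposited into the low bit of each byte.)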
1115 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1116 sve_dup(vtmp2, B, 1); 1117 1118 // BDEP vtmp1.D, vtmp1.D, vtmp2.D 1119 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1120 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1121 // --------------------------------------- 1122 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1123 sve_bdep(vtmp1, D, vtmp1, vtmp2); 1124 1125 if (bt != T_BYTE) { 1126 sve_vector_extend(vtmp1, size, vtmp1, B); 1127 } 1128 // Generate mask according to the given vector, in which the elements have been 1129 // extended to expected type. 1130 // dst = 0b01101001 10001101 1131 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1132 } 1133 1134 // Clobbers: rflags 1135 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1136 FloatRegister zn, FloatRegister zm, Condition cond) { 1137 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1138 FloatRegister z1 = zn, z2 = zm; 1139 switch (cond) { 1140 case LE: z1 = zm; z2 = zn; cond = GE; break; 1141 case LT: z1 = zm; z2 = zn; cond = GT; break; 1142 case LO: z1 = zm; z2 = zn; cond = HI; break; 1143 case LS: z1 = zm; z2 = zn; cond = HS; break; 1144 default: 1145 break; 1146 } 1147 1148 SIMD_RegVariant size = elemType_to_regVariant(bt); 1149 if (is_floating_point_type(bt)) { 1150 sve_fcm(cond, pd, size, pg, z1, z2); 1151 } else { 1152 assert(is_integral_type(bt), "unsupported element type"); 1153 sve_cmp(cond, pd, size, pg, z1, z2); 1154 } 1155 } 1156 1157 // Get index of the last mask lane that is set 1158 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1159 SIMD_RegVariant size = elemType_to_regVariant(bt); 1160 sve_rev(ptmp, size, src); 1161 sve_brkb(ptmp, ptrue, ptmp, false); 1162 sve_cntp(dst, size, ptrue, ptmp); 1163 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1164 subw(dst, rscratch1, dst); 1165 } 1166 1167 // Extend integer vector src to dst with the same lane count 1168 // but larger element size, e.g. 4B -> 4I 1169 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1170 FloatRegister src, BasicType src_bt) { 1171 if (src_bt == T_BYTE) { 1172 if (dst_bt == T_SHORT) { 1173 // 4B/8B to 4S/8S 1174 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported"); 1175 sxtl(dst, T8H, src, T8B); 1176 } else { 1177 // 4B to 4I 1178 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1179 sxtl(dst, T8H, src, T8B); 1180 sxtl(dst, T4S, dst, T4H); 1181 } 1182 } else if (src_bt == T_SHORT) { 1183 // 4S to 4I 1184 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1185 sxtl(dst, T4S, src, T4H); 1186 } else if (src_bt == T_INT) { 1187 // 2I to 2L 1188 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1189 sxtl(dst, T2D, src, T2S); 1190 } else { 1191 ShouldNotReachHere(); 1192 } 1193 } 1194 1195 // Narrow integer vector src down to dst with the same lane count 1196 // but smaller element size, e.g. 
4I -> 4B 1197 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1198 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1199 if (src_bt == T_SHORT) { 1200 // 4S/8S to 4B/8B 1201 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1202 assert(dst_bt == T_BYTE, "unsupported"); 1203 xtn(dst, T8B, src, T8H); 1204 } else if (src_bt == T_INT) { 1205 // 4I to 4B/4S 1206 assert(src_vlen_in_bytes == 16, "unsupported"); 1207 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1208 xtn(dst, T4H, src, T4S); 1209 if (dst_bt == T_BYTE) { 1210 xtn(dst, T8B, dst, T8H); 1211 } 1212 } else if (src_bt == T_LONG) { 1213 // 2L to 2I 1214 assert(src_vlen_in_bytes == 16, "unsupported"); 1215 assert(dst_bt == T_INT, "unsupported"); 1216 xtn(dst, T2S, src, T2D); 1217 } else { 1218 ShouldNotReachHere(); 1219 } 1220 } 1221 1222 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1223 FloatRegister src, SIMD_RegVariant src_size) { 1224 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1225 if (src_size == B) { 1226 switch (dst_size) { 1227 case H: 1228 sve_sunpklo(dst, H, src); 1229 break; 1230 case S: 1231 sve_sunpklo(dst, H, src); 1232 sve_sunpklo(dst, S, dst); 1233 break; 1234 case D: 1235 sve_sunpklo(dst, H, src); 1236 sve_sunpklo(dst, S, dst); 1237 sve_sunpklo(dst, D, dst); 1238 break; 1239 default: 1240 ShouldNotReachHere(); 1241 } 1242 } else if (src_size == H) { 1243 if (dst_size == S) { 1244 sve_sunpklo(dst, S, src); 1245 } else { // D 1246 sve_sunpklo(dst, S, src); 1247 sve_sunpklo(dst, D, dst); 1248 } 1249 } else if (src_size == S) { 1250 sve_sunpklo(dst, D, src); 1251 } 1252 } 1253 1254 // Vector narrow from src to dst with specified element sizes. 1255 // High part of dst vector will be filled with zero. 1256 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1257 FloatRegister src, SIMD_RegVariant src_size, 1258 FloatRegister tmp) { 1259 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1260 assert_different_registers(src, tmp); 1261 sve_dup(tmp, src_size, 0); 1262 if (src_size == D) { 1263 switch (dst_size) { 1264 case S: 1265 sve_uzp1(dst, S, src, tmp); 1266 break; 1267 case H: 1268 assert_different_registers(dst, tmp); 1269 sve_uzp1(dst, S, src, tmp); 1270 sve_uzp1(dst, H, dst, tmp); 1271 break; 1272 case B: 1273 assert_different_registers(dst, tmp); 1274 sve_uzp1(dst, S, src, tmp); 1275 sve_uzp1(dst, H, dst, tmp); 1276 sve_uzp1(dst, B, dst, tmp); 1277 break; 1278 default: 1279 ShouldNotReachHere(); 1280 } 1281 } else if (src_size == S) { 1282 if (dst_size == H) { 1283 sve_uzp1(dst, H, src, tmp); 1284 } else { // B 1285 assert_different_registers(dst, tmp); 1286 sve_uzp1(dst, H, src, tmp); 1287 sve_uzp1(dst, B, dst, tmp); 1288 } 1289 } else if (src_size == H) { 1290 sve_uzp1(dst, B, src, tmp); 1291 } 1292 } 1293 1294 // Extend src predicate to dst predicate with the same lane count but larger 1295 // element size, e.g. 
64Byte -> 512Long 1296 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1297 uint dst_element_length_in_bytes, 1298 uint src_element_length_in_bytes) { 1299 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1300 sve_punpklo(dst, src); 1301 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1302 sve_punpklo(dst, src); 1303 sve_punpklo(dst, dst); 1304 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1305 sve_punpklo(dst, src); 1306 sve_punpklo(dst, dst); 1307 sve_punpklo(dst, dst); 1308 } else { 1309 assert(false, "unsupported"); 1310 ShouldNotReachHere(); 1311 } 1312 } 1313 1314 // Narrow src predicate to dst predicate with the same lane count but 1315 // smaller element size, e.g. 512Long -> 64Byte 1316 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1317 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1318 // The insignificant bits in src predicate are expected to be zero. 1319 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1320 // passed as the second argument. An example narrowing operation with a given mask would be - 1321 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1322 // Mask (for 2 Longs) : TF 1323 // Predicate register for the above mask (16 bits) : 00000001 00000000 1324 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1325 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1326 assert_different_registers(src, ptmp); 1327 assert_different_registers(dst, ptmp); 1328 sve_pfalse(ptmp); 1329 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1330 sve_uzp1(dst, B, src, ptmp); 1331 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1332 sve_uzp1(dst, H, src, ptmp); 1333 sve_uzp1(dst, B, dst, ptmp); 1334 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1335 sve_uzp1(dst, S, src, ptmp); 1336 sve_uzp1(dst, H, dst, ptmp); 1337 sve_uzp1(dst, B, dst, ptmp); 1338 } else { 1339 assert(false, "unsupported"); 1340 ShouldNotReachHere(); 1341 } 1342 } 1343 1344 // Vector reduction add for integral type with ASIMD instructions. 1345 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1346 Register isrc, FloatRegister vsrc, 1347 unsigned vector_length_in_bytes, 1348 FloatRegister vtmp) { 1349 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1350 assert_different_registers(dst, isrc); 1351 bool isQ = vector_length_in_bytes == 16; 1352 1353 BLOCK_COMMENT("neon_reduce_add_integral {"); 1354 switch(bt) { 1355 case T_BYTE: 1356 addv(vtmp, isQ ? T16B : T8B, vsrc); 1357 smov(dst, vtmp, B, 0); 1358 addw(dst, dst, isrc, ext::sxtb); 1359 break; 1360 case T_SHORT: 1361 addv(vtmp, isQ ? T8H : T4H, vsrc); 1362 smov(dst, vtmp, H, 0); 1363 addw(dst, dst, isrc, ext::sxth); 1364 break; 1365 case T_INT: 1366 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1367 umov(dst, vtmp, S, 0); 1368 addw(dst, dst, isrc); 1369 break; 1370 case T_LONG: 1371 assert(isQ, "unsupported"); 1372 addpd(vtmp, vsrc); 1373 umov(dst, vtmp, D, 0); 1374 add(dst, dst, isrc); 1375 break; 1376 default: 1377 assert(false, "unsupported"); 1378 ShouldNotReachHere(); 1379 } 1380 BLOCK_COMMENT("} neon_reduce_add_integral"); 1381 } 1382 1383 // Vector reduction multiply for integral type with ASIMD instructions. 
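// For sub-long types the vector is folded in half repeatedly with MULV
// (e.g. 16 byte lanes -> 8 -> 4 -> 2), and the last two lanes are combined
// with the scalar input isrc using general-purpose multiplies; a LONG
// reduction is done entirely on the general-purpose side.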
1384 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1385 // Clobbers: rscratch1 1386 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1387 Register isrc, FloatRegister vsrc, 1388 unsigned vector_length_in_bytes, 1389 FloatRegister vtmp1, FloatRegister vtmp2) { 1390 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1391 bool isQ = vector_length_in_bytes == 16; 1392 1393 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1394 switch(bt) { 1395 case T_BYTE: 1396 if (isQ) { 1397 // Multiply the lower half and higher half of vector iteratively. 1398 // vtmp1 = vsrc[8:15] 1399 ins(vtmp1, D, vsrc, 0, 1); 1400 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1401 mulv(vtmp1, T8B, vtmp1, vsrc); 1402 // vtmp2 = vtmp1[4:7] 1403 ins(vtmp2, S, vtmp1, 0, 1); 1404 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1405 mulv(vtmp1, T8B, vtmp2, vtmp1); 1406 } else { 1407 ins(vtmp1, S, vsrc, 0, 1); 1408 mulv(vtmp1, T8B, vtmp1, vsrc); 1409 } 1410 // vtmp2 = vtmp1[2:3] 1411 ins(vtmp2, H, vtmp1, 0, 1); 1412 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1413 mulv(vtmp2, T8B, vtmp2, vtmp1); 1414 // dst = vtmp2[0] * isrc * vtmp2[1] 1415 umov(rscratch1, vtmp2, B, 0); 1416 mulw(dst, rscratch1, isrc); 1417 sxtb(dst, dst); 1418 umov(rscratch1, vtmp2, B, 1); 1419 mulw(dst, rscratch1, dst); 1420 sxtb(dst, dst); 1421 break; 1422 case T_SHORT: 1423 if (isQ) { 1424 ins(vtmp2, D, vsrc, 0, 1); 1425 mulv(vtmp2, T4H, vtmp2, vsrc); 1426 ins(vtmp1, S, vtmp2, 0, 1); 1427 mulv(vtmp1, T4H, vtmp1, vtmp2); 1428 } else { 1429 ins(vtmp1, S, vsrc, 0, 1); 1430 mulv(vtmp1, T4H, vtmp1, vsrc); 1431 } 1432 umov(rscratch1, vtmp1, H, 0); 1433 mulw(dst, rscratch1, isrc); 1434 sxth(dst, dst); 1435 umov(rscratch1, vtmp1, H, 1); 1436 mulw(dst, rscratch1, dst); 1437 sxth(dst, dst); 1438 break; 1439 case T_INT: 1440 if (isQ) { 1441 ins(vtmp1, D, vsrc, 0, 1); 1442 mulv(vtmp1, T2S, vtmp1, vsrc); 1443 } else { 1444 vtmp1 = vsrc; 1445 } 1446 umov(rscratch1, vtmp1, S, 0); 1447 mul(dst, rscratch1, isrc); 1448 umov(rscratch1, vtmp1, S, 1); 1449 mul(dst, rscratch1, dst); 1450 break; 1451 case T_LONG: 1452 umov(rscratch1, vsrc, D, 0); 1453 mul(dst, isrc, rscratch1); 1454 umov(rscratch1, vsrc, D, 1); 1455 mul(dst, dst, rscratch1); 1456 break; 1457 default: 1458 assert(false, "unsupported"); 1459 ShouldNotReachHere(); 1460 } 1461 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1462 } 1463 1464 // Vector reduction multiply for floating-point type with ASIMD instructions. 
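// The lanes are combined strictly in order (dst = fsrc * v[0] * v[1] * ...)
// rather than pairwise, presumably because floating-point rounding makes the
// reduction order observable. A scalar sketch of the equivalent computation:
//
//   float reduce_mul_fp(float fsrc, float* v, int n) {
//     float r = fsrc;
//     for (int i = 0; i < n; i++) {
//       r *= v[i];
//     }
//     return r;
//   }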
1465 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1466 FloatRegister fsrc, FloatRegister vsrc, 1467 unsigned vector_length_in_bytes, 1468 FloatRegister vtmp) { 1469 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1470 bool isQ = vector_length_in_bytes == 16; 1471 1472 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1473 switch(bt) { 1474 case T_FLOAT: 1475 fmuls(dst, fsrc, vsrc); 1476 ins(vtmp, S, vsrc, 0, 1); 1477 fmuls(dst, dst, vtmp); 1478 if (isQ) { 1479 ins(vtmp, S, vsrc, 0, 2); 1480 fmuls(dst, dst, vtmp); 1481 ins(vtmp, S, vsrc, 0, 3); 1482 fmuls(dst, dst, vtmp); 1483 } 1484 break; 1485 case T_DOUBLE: 1486 assert(isQ, "unsupported"); 1487 fmuld(dst, fsrc, vsrc); 1488 ins(vtmp, D, vsrc, 0, 1); 1489 fmuld(dst, dst, vtmp); 1490 break; 1491 default: 1492 assert(false, "unsupported"); 1493 ShouldNotReachHere(); 1494 } 1495 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1496 } 1497 1498 // Helper to select logical instruction 1499 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1500 Register Rn, Register Rm, 1501 enum shift_kind kind, unsigned shift) { 1502 switch(opc) { 1503 case Op_AndReductionV: 1504 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1505 break; 1506 case Op_OrReductionV: 1507 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1508 break; 1509 case Op_XorReductionV: 1510 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1511 break; 1512 default: 1513 assert(false, "unsupported"); 1514 ShouldNotReachHere(); 1515 } 1516 } 1517 1518 // Vector reduction logical operations And, Or, Xor 1519 // Clobbers: rscratch1 1520 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1521 Register isrc, FloatRegister vsrc, 1522 unsigned vector_length_in_bytes) { 1523 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1524 "unsupported"); 1525 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1526 assert_different_registers(dst, isrc); 1527 bool isQ = vector_length_in_bytes == 16; 1528 1529 BLOCK_COMMENT("neon_reduce_logical {"); 1530 umov(rscratch1, vsrc, isQ ? D : S, 0); 1531 umov(dst, vsrc, isQ ? 
D : S, 1); 1532 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 1533 switch(bt) { 1534 case T_BYTE: 1535 if (isQ) { 1536 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1537 } 1538 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1539 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 1540 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1541 sxtb(dst, dst); 1542 break; 1543 case T_SHORT: 1544 if (isQ) { 1545 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1546 } 1547 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1548 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1549 sxth(dst, dst); 1550 break; 1551 case T_INT: 1552 if (isQ) { 1553 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1554 } 1555 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1556 break; 1557 case T_LONG: 1558 assert(isQ, "unsupported"); 1559 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 1560 break; 1561 default: 1562 assert(false, "unsupported"); 1563 ShouldNotReachHere(); 1564 } 1565 BLOCK_COMMENT("} neon_reduce_logical"); 1566 } 1567 1568 // Vector reduction min/max for integral type with ASIMD instructions. 1569 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 1570 // Clobbers: rscratch1, rflags 1571 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 1572 Register isrc, FloatRegister vsrc, 1573 unsigned vector_length_in_bytes, 1574 FloatRegister vtmp) { 1575 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 1576 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1577 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 1578 assert_different_registers(dst, isrc); 1579 bool isQ = vector_length_in_bytes == 16; 1580 bool is_min = opc == Op_MinReductionV; 1581 1582 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 1583 if (bt == T_LONG) { 1584 assert(vtmp == fnoreg, "should be"); 1585 assert(isQ, "should be"); 1586 umov(rscratch1, vsrc, D, 0); 1587 cmp(isrc, rscratch1); 1588 csel(dst, isrc, rscratch1, is_min ? LT : GT); 1589 umov(rscratch1, vsrc, D, 1); 1590 cmp(dst, rscratch1); 1591 csel(dst, dst, rscratch1, is_min ? LT : GT); 1592 } else { 1593 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1594 if (size == T2S) { 1595 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 1596 } else { 1597 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 1598 } 1599 if (bt == T_INT) { 1600 umov(dst, vtmp, S, 0); 1601 } else { 1602 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 1603 } 1604 cmpw(dst, isrc); 1605 cselw(dst, dst, isrc, is_min ? LT : GT); 1606 } 1607 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 1608 } 1609 1610 // Vector reduction for integral type with SVE instruction. 1611 // Supported operations are Add, And, Or, Xor, Max, Min. 1612 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
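// In outline: a single SVE horizontal reduction (UADDV/ANDV/ORV/EORV/SMAXV/
// SMINV) folds the lanes of src2 that are active in pg into one element, which
// is then moved to a general register, combined with the scalar input src1,
// and, for sub-int element types, sign-extended to keep the int result canonical.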
1613 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 1614 FloatRegister src2, PRegister pg, FloatRegister tmp) { 1615 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 1616 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1617 assert_different_registers(src1, dst); 1618 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 1619 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1620 switch (opc) { 1621 case Op_AddReductionVI: { 1622 sve_uaddv(tmp, size, pg, src2); 1623 if (bt == T_BYTE) { 1624 smov(dst, tmp, size, 0); 1625 addw(dst, src1, dst, ext::sxtb); 1626 } else if (bt == T_SHORT) { 1627 smov(dst, tmp, size, 0); 1628 addw(dst, src1, dst, ext::sxth); 1629 } else { 1630 umov(dst, tmp, size, 0); 1631 addw(dst, dst, src1); 1632 } 1633 break; 1634 } 1635 case Op_AddReductionVL: { 1636 sve_uaddv(tmp, size, pg, src2); 1637 umov(dst, tmp, size, 0); 1638 add(dst, dst, src1); 1639 break; 1640 } 1641 case Op_AndReductionV: { 1642 sve_andv(tmp, size, pg, src2); 1643 if (bt == T_INT || bt == T_LONG) { 1644 umov(dst, tmp, size, 0); 1645 } else { 1646 smov(dst, tmp, size, 0); 1647 } 1648 if (bt == T_LONG) { 1649 andr(dst, dst, src1); 1650 } else { 1651 andw(dst, dst, src1); 1652 } 1653 break; 1654 } 1655 case Op_OrReductionV: { 1656 sve_orv(tmp, size, pg, src2); 1657 if (bt == T_INT || bt == T_LONG) { 1658 umov(dst, tmp, size, 0); 1659 } else { 1660 smov(dst, tmp, size, 0); 1661 } 1662 if (bt == T_LONG) { 1663 orr(dst, dst, src1); 1664 } else { 1665 orrw(dst, dst, src1); 1666 } 1667 break; 1668 } 1669 case Op_XorReductionV: { 1670 sve_eorv(tmp, size, pg, src2); 1671 if (bt == T_INT || bt == T_LONG) { 1672 umov(dst, tmp, size, 0); 1673 } else { 1674 smov(dst, tmp, size, 0); 1675 } 1676 if (bt == T_LONG) { 1677 eor(dst, dst, src1); 1678 } else { 1679 eorw(dst, dst, src1); 1680 } 1681 break; 1682 } 1683 case Op_MaxReductionV: { 1684 sve_smaxv(tmp, size, pg, src2); 1685 if (bt == T_INT || bt == T_LONG) { 1686 umov(dst, tmp, size, 0); 1687 } else { 1688 smov(dst, tmp, size, 0); 1689 } 1690 if (bt == T_LONG) { 1691 cmp(dst, src1); 1692 csel(dst, dst, src1, Assembler::GT); 1693 } else { 1694 cmpw(dst, src1); 1695 cselw(dst, dst, src1, Assembler::GT); 1696 } 1697 break; 1698 } 1699 case Op_MinReductionV: { 1700 sve_sminv(tmp, size, pg, src2); 1701 if (bt == T_INT || bt == T_LONG) { 1702 umov(dst, tmp, size, 0); 1703 } else { 1704 smov(dst, tmp, size, 0); 1705 } 1706 if (bt == T_LONG) { 1707 cmp(dst, src1); 1708 csel(dst, dst, src1, Assembler::LT); 1709 } else { 1710 cmpw(dst, src1); 1711 cselw(dst, dst, src1, Assembler::LT); 1712 } 1713 break; 1714 } 1715 default: 1716 assert(false, "unsupported"); 1717 ShouldNotReachHere(); 1718 } 1719 1720 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 1721 if (bt == T_BYTE) { 1722 sxtb(dst, dst); 1723 } else if (bt == T_SHORT) { 1724 sxth(dst, dst); 1725 } 1726 } 1727 } 1728 1729 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 1730 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 1731 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
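// Conceptually (a sketch of the predicate produced):
//
//   for (uint i = 0; i < max_vector_length; i++) {
//     dst[i] = (i < lane_cnt);
//   }
//
// The implementation prefers a single PTRUE with an immediate pattern
// (VL1..VL256, ALL, POW2, MUL4, MUL3) and falls back to WHILELT otherwise.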
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all elements to true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src  = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst  = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend the lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements of type INT to the right,
  // and fill the remaining elements with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat for the highest half.
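  // (Sketch of the remaining steps, for clarity: the same unpack/compact/narrow sequence is
  // applied to the high half into vtmp1, which is then shifted left across lanes by TRUE_CNT
  // positions via sve_index/sve_tbl and OR-ed into dst.)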
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift (cross-lane) the compressed high by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high (after the shift) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src  = 88 77 66 55 44 33 22 11
  //                  mask = 01 00 00 01 01 00 01 01
  // Expected result: dst  = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend the lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements of type SHORT to the right,
  // and fill the remaining elements with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat for the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:  dst   = 00 00 00 00 00 44 22 11
  // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross-lane) the compressed high by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high (after the shift) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
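    // Note (illustration, not from the original comments): jint_cast(0x1.0p23f) and
    // julong_cast(0x1.0p52) are the bit patterns of 2^23 and 2^52, the magnitudes from
    // which every float/double value is already integral (ulp >= 1); the constant is used
    // below when building the predicate that selects the floor(src + 0.5) path.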
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1);                      // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp,
                                          PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint);       // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}