1 /* 2 * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Pointer-to-member type for a single-character load (ldrb for Latin1,
// ldrh for UTF-16), selected once at code-generation time.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
// str1: pattern string, cnt1: pattern length (in characters)
// str2: source string, cnt2: source length (in characters)
// icnt1: constant pattern length when known at compile time, otherwise -1
// ae:    encoding pair (StrIntrinsicNode::LL/UU/LU/UL)
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // Initialize the bad-character skip table (on the stack) to cnt1; v0 was
    // pre-filled with cnt1 in every byte lane by the dup above.
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP); // char >= 256: not representable in the byte table
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE); // release the bad-character table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5); // byte offset of the match (tmp5 = saved str2)
      if (!str2_isL) lsr(result, result, 1); // bytes -> UTF-16 characters
      add(sp, sp, ASIZE); // release the bad-character table
      b(DONE);

    BIND(LINEARSTUB);
      cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
      br(LT, LINEAR_MEDIUM);
      mov(result, zr);
      RuntimeAddress stub = NULL;
      if (isL) {
        stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
        assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
      } else if (str1_isL) {
        stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
        assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
      } else {
        stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
        assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
      }
      trampoline_call(stub);
      b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        // Scan the source for the first pattern character; on a hit, compare
        // the remainder of the pattern with negative-index addressing.
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        // Whole 4-character pattern fits in one register; compare wholesale.
        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
      }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        // 2-character pattern loaded as one unit.
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        // 3-character pattern: first two characters as a unit, third separately.
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        // Single-character pattern: splat the character across a 64-bit word
        // and use the SWAR zero-in-word trick to test 8 bytes (or 4 chars)
        // per iteration.
        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2); // non-zero iff some lane of ch2 was zero
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // Re-check the (possibly overlapping) last word once.
        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Locate the first matching lane: byte-reverse, count leading zeros,
        // convert the bit index to a byte offset.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result = (source length scanned) + (negative remaining byte offset
    // converted back to characters).
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

// Re-declared (identically) for the functions below; also a widening-extend
// member-pointer type used by string_compare.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of a UTF-16 char `ch` in a UTF-16 string
// (str1, cnt1); result is the character index or -1. Uses the same SWAR
// zero-in-word trick as the single-character case of string_indexof.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Splat the 16-bit char across all four lanes of a 64-bit word.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2); // non-zero iff some 16-bit lane matched
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Re-check the (possibly overlapping) last word once.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); // bit index -> byte offset
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 4 chars: plain per-character loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1); // bytes -> characters
  BIND(DONE);
}

// SVE variant of the single-char search; isL selects Latin1 (byte lanes)
// vs UTF-16 (halfword lanes). Processes one whole SVE vector per iteration
// under a whilelt-generated predicate.
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

// Latin1 variant of string_indexof_char: search for byte `ch` in the byte
// string (str1, cnt1); result is the index or -1.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Splat the byte across all eight lanes of a 64-bit word.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2); // non-zero iff some byte lane matched
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Re-check the (possibly overlapping) last word once.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); // bit index -> byte offset
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 8 bytes: plain per-byte loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
// Lexicographically compare (str1, cnt1) and (str2, cnt2); result is
// negative/zero/positive like compareTo. ae selects the encoding pair
// (StrIntrinsicNode::LL/UU/LU/UL). Long inputs are handed to a
// pre-generated runtime stub.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE); // same backing array: trivially equal prefix
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin1 side is widened to UTF-16 in the vector unit (zip1 with zero).
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16); // round down to char boundary
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = NULL;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
    trampoline_call(stub);
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

// Emit a NEON element-wise compare of src1 and src2 into dst, for the
// BoolTest condition `cond`. Conditions NEON lacks directly are synthesized
// by swapping operands (le/lt and unsigned variants) or by negating an
// equality result (ne).
void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, int cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    switch (cond) {
      case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        fcmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: fcmge(dst, size, src1, src2); break;
      case BoolTest::gt: fcmgt(dst, size, src1, src2); break;
      case BoolTest::le: fcmge(dst, size, src2, src1); break;
      case BoolTest::lt: fcmgt(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  } else {
    switch (cond) {
      case BoolTest::eq: cmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        cmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: cmge(dst, size, src1, src2); break;
      case BoolTest::gt: cmgt(dst, size, src1, src2); break;
      case BoolTest::le: cmge(dst, size, src2, src1); break;
      case BoolTest::lt: cmgt(dst, size, src2, src1); break;
      case BoolTest::uge: cmhs(dst, size, src1, src2); break;
      case BoolTest::ugt: cmhi(dst, size, src1, src2); break;
      case BoolTest::ult: cmhi(dst, size, src2, src1); break;
      case BoolTest::ule: cmhs(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  // Each ORR with a logically-shifted copy folds the per-byte 0/1 flags into
  // half as many positions; the final AND keeps only the low 8 result bits.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);

  // Pack the mask into vector with sequential bytes.
  // Copy immediate 1 into vtmp1 under predicate src, then narrow the
  // elements down to bytes so that each mask lane occupies one byte.
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  // Compress the lowest 8 bytes.
  fmovd(dst, vtmp1);
  bytemask_compress(dst);
  if (lane_cnt <= 8) return;

  // Repeat on higher bytes and join the results.
  // Compress 8 bytes in each iteration.
  for (int idx = 1; idx < (lane_cnt / 8); idx++) {
    sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
    bytemask_compress(rscratch1);
    orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); // merge into bits [idx*8+7 : idx*8]
  }
}

// Emit an SVE predicated compare of zn against zm into predicate pd, under
// governing predicate pg. 'cond' is a BoolTest condition code.
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, int cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  // Convert the original BoolTest condition to Assembler::condition.
  // LE/LT have no direct mapping here; they are implemented as GE/GT with
  // the two source operands swapped.
  Condition condition;
  switch (cond) {
    case BoolTest::eq: condition = Assembler::EQ; break;
    case BoolTest::ne: condition = Assembler::NE; break;
    case BoolTest::le: z1 = zm; z2 = zn; condition = Assembler::GE; break;
    case BoolTest::ge: condition = Assembler::GE; break;
    case BoolTest::lt: z1 = zm; z2 = zn; condition = Assembler::GT; break;
    case BoolTest::gt: condition = Assembler::GT; break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    sve_fcm(condition, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(condition, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the lane order so the last set lane becomes the first, take the
  // lanes before that first set lane (BRKB), count them, and convert the
  // count back to an index in the original lane order:
  //   dst = (lane_cnt - 1) - count
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Widen each element of src from src_size to dst_size into dst, using
// repeated signed unpacks of the low half (sign-extending at each step).
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
  if (src_size == B) {
    switch (dst_size) {
    case H:
      sve_sunpklo(dst, H, src);
      break;
    case S:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      break;
    case D:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      sve_sunpklo(dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      sve_sunpklo(dst, S, src);
    } else { // D
      sve_sunpklo(dst, S, src);
      sve_sunpklo(dst, D, dst);
    }
  } else if (src_size == S) {
    sve_sunpklo(dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                         FloatRegister src, SIMD_RegVariant src_size,
                                         FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  // tmp holds zero throughout; UZP1 interleaves the even-numbered elements of
  // its two sources, so pairing with zero narrows while zero-filling the top.
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      // Multi-step paths write dst before tmp's later use, so dst must not
      // alias tmp (tmp still has to be zero for the following UZP1).
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  // Each PUNPKLO doubles the element size of the predicate, so apply it
  // once, twice or three times for 2x, 4x and 8x widening respectively.
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // Each UZP1 of the predicate with itself halves the element size, so apply
  // one, two or three steps for 2x, 4x and 8x narrowing respectively.
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, src);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, src);
    sve_uzp1(dst, B, dst, dst);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, src);
    sve_uzp1(dst, H, dst, dst);
    sve_uzp1(dst, B, dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Reduce the active lanes of vector src2 with scalar src1 into dst, for the
// integral reduction node 'opc' (add/and/or/xor/max/min). For sub-int element
// types the extracted lane value is sign-extended where needed, and bitwise
// reductions are sign-extended back to byte/short width at the end.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      smov(dst, tmp, size, 0);
      if (bt == T_BYTE) {
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        addw(dst, src1, dst, ext::sxth);
      } else {
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_LONG) {
        umov(dst, tmp, size, 0);
        andr(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_LONG) {
        umov(dst, tmp, size, 0);
        orr(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_LONG) {
        umov(dst, tmp, size, 0);
        eor(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_LONG) {
        umov(dst, tmp, size, 0);
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        smov(dst, tmp, size, 0);
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_LONG) {
        umov(dst, tmp, size, 0);
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        smov(dst, tmp, size, 0);
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // For the bitwise reductions on sub-int types, re-establish the
  // sign-extended representation of the byte/short result.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true if the element number is
// in the range of [0, lane_cnt), or to false otherwise.
void C2_MacroAssembler::sve_ptrue_lanecnt(PRegister dst, SIMD_RegVariant size, int lane_cnt) {
  assert(size != Q, "invalid size");
  // For counts 1..8 the SVE PTRUE pattern encoding equals the count itself
  // (VL1..VL8); larger power-of-two counts use their dedicated patterns.
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      break;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      break;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      break;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      break;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      break;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // SVE COMPACT only supports S and D element sizes, so the SHORT elements
  // are widened to INT, compacted in two halves, narrowed back, and the two
  // compressed halves are then joined.
  // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift(cross lane) compressed high with TRUE_CNT lanes,
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high(after shifted) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Same as sve_compress_short but for BYTE elements: widen each half to
// SHORT, compress via sve_compress_short, narrow back and join.
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src   = 88 77 66 55 44 33 22 11
  //                  mask  = 01 00 00 01 01 00 01 01
  // Expected result: dst   = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat to the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  // vtmp4 was clobbered by sve_compress_short above (it was passed as a
  // temporary), so re-zero it before using it for the narrowing UZP1.
  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:   dst   = 00 00 00 00 00 44 22 11
  // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift(cross lane) compressed high with TRUE_CNT lanes,
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high(after shifted) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

// Reverse the order of the bits within each element of src into dst.
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    // RBIT reverses bits within each byte only; first reverse the bytes of
    // each element, then the per-byte bit reversal completes a full
    // per-element bit reversal.
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

// Reverse the byte order within each element of src into dst.
void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      // Byte elements need no reversal; just move src to dst if they differ
      // (ORR of a register with itself is a vector move).
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,
                                             bool is_signed, FloatRegister vtmp) {
  assert(UseSVE > 0 && size != Q, "unsupported");
  assert(!(is_signed && size == D), "signed extract (D) not supported.");
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    is_signed ? smov(dst, src, size, idx) : umov(dst, src, size, idx);
  } else {
    // Element lies beyond the NEON-accessible low 128 bits: copy src, shift
    // the wanted element down to lane 0 with EXT, then extract lane 0.
    sve_orr(vtmp, src, src);
    // idx << size == idx * element-size-in-bytes (B=0, H=1, S=2, D=3).
    sve_ext(vtmp, vtmp, idx << size);
    is_signed ? smov(dst, vtmp, size, 0) : umov(dst, vtmp, size, 0);
  }
}

// java.lang.Math::round intrinsics

// Vectorized Math.round for float/double lanes using NEON.
// Clobbers: rscratch1
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));  // smallest float with no fractional part
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52)); // smallest double with no fractional part
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Compare the unsigned bit pattern of -src against the threshold to decide,
  // per lane, which of the two rounded results to keep.
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cmhs(tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // BIF: keep dst where the flag lane is set, take tmp1 where it is clear.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Vectorized Math.round for float/double lanes using SVE.
// Clobbers: rscratch1
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister ptmp, SIMD_RegVariant T) {
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));  // smallest float with no fractional part
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52)); // smallest double with no fractional part
      break;
    default:
      assert(T == S || T == D, "invalid arrangement");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // Build a predicate of the lanes that still need the floor(src + 0.5)
  // treatment; if no lane does (SVE "none" condition, encoded as EQ),
  // skip the extra work.
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, ptmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, ptmp, 0.5);
    sve_fadd(tmp1, T, ptmp, src);
    sve_frintm(dst, T, ptmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}