1 /* 2 * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Pointer-to-member type used to pick a 1-character load instruction
// (ldrb for Latin1, ldrh for UTF-16) at code-generation time.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
//
// str2/cnt2 - the source string and its length (in characters)
// str1/cnt1 - the pattern string and its length (in characters)
// icnt1     - pattern length when it is a compile-time constant (1..4),
//             or -1 when the length is only known at runtime
// result    - receives the index of the first occurrence, or -1
// ae        - argument encoding (StrIntrinsicNode::LL/UU/LU/UL) describing
//             whether pattern/source are Latin1 or UTF-16
//
// Clobbers rscratch1/rscratch2 (used as ch1/ch2) and all tmp registers;
// the Boyer-Moore path also temporarily allocates ASIZE bytes on sp for
// the bad-character skip table (released before every exit from that path).
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //       int i, j;
  //       unsigned c;
  //       unsigned char bc[ASIZE];
  //
  //       /* Preprocessing */
  //       for (i = 0; i < ASIZE; ++i)
  //          bc[i] = m;
  //       for (i = 0; i < m - 1; ) {
  //          c = x[i];
  //          ++i;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef PATTERN_STRING_IS_LATIN1
  //          bc[c] = m - i;
  //          #else
  //          if (c < ASIZE) bc[c] = m - i;
  //          #endif
  //       }
  //
  //       /* Searching */
  //       j = 0;
  //       while (j <= n - m) {
  //          c = y[i+j];
  //          if (x[m-1] == c)
  //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //          if (i < 0) return j;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef SOURCE_STRING_IS_LATIN1
  //          // LL case: (c< 256) always true. Remove branch
  //          j += bc[y[j+m-1]];
  //          #endif
  //          #ifndef PATTERN_STRING_IS_UTF
  //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += 1
  //          #endif
  //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += m
  //          #endif
  //       }
  //    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // Initialize the bad-character table on the stack: every entry starts
    // out as cnt1 (broadcast into v0 above), stored 32 bytes at a time.
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    // Fill in skip distances for the first cnt1-1 pattern characters;
    // UTF-16 characters >= 256 keep the default (full pattern length).
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8); // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE); // release the bad-character table before exiting
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // tmp5 still holds the original str2, so str2 - tmp5 is the byte offset
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = NULL;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
    }
    trampoline_call(stub);
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
    BIND(LINEAR_MEDIUM);
      // Scan for the first pattern character; on a hit, verify the rest of
      // the pattern with the inner STR1 loop. Both strings are walked with
      // negative offsets from their (biased) end addresses.
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

    BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

    BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      // Constant pattern length 4: compare a whole 4-character chunk at once.
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      // Pattern length 2: compare a 2-character chunk per iteration.
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      // Pattern length 3: match the first 2 characters as a chunk, then
      // check the third character separately.
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      // Pattern length 1: broadcast the character across a 64-bit word and
      // use the SWAR "has-zero" trick on the XOR to test 8 bytes (or 4
      // UTF-16 chars) per iteration.
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
    BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2); // zero lane <=> match
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2); // high bit set in a lane <=> that lane was zero
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      // Re-run one (possibly overlapping) final 8-byte chunk at the very end.
      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

    BIND(HAS_ZERO);
      // Locate the first zero lane: reverse bytes so clz counts from the
      // low end, then convert the bit index to a byte offset.
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

    BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
    BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the (negative) byte offset back into a character index.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of a (UTF-16) character in a UTF-16 string.
// str1/cnt1 - string address and length in characters; ch - the character;
// result    - receives the index of the match, or -1.
// Clobbers rscratch1/rscratch2, tmp1-tmp3, cnt1, str1 and ch.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Broadcast ch into all four 16-bit lanes of a 64-bit word.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    // SWAR zero-lane detection on ch ^ chunk: a 16-bit lane of the XOR is
    // zero exactly where the character matches.
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Process the last (possibly overlapping) 8-byte chunk once more.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 4 characters: plain one-character-at-a-time loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// SVE variant of string_indexof_char: scans one whole vector of characters
// per iteration under a WHILELT-generated governing predicate, so the tail
// needs no scalar cleanup. isL selects Latin1 (byte) vs UTF-16 (halfword).
// Clobbers rscratch1/rscratch2, ztmp1/ztmp2 and both predicate registers.
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    // result = idx - 1 + count of active lanes up to and including the match
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

// Latin1 counterpart of string_indexof_char: same structure, but works on
// bytes (8 characters per 64-bit SWAR chunk instead of 4).
// Clobbers rscratch1/rscratch2, tmp1-tmp3, cnt1, str1 and ch.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Broadcast ch into all eight byte lanes of a 64-bit word.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    // SWAR zero-byte detection on ch ^ chunk.
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Process the last (possibly overlapping) 8-byte chunk once more.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 8 characters: plain byte-at-a-time loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
// Compare two strings lexicographically, as String.compareTo does.
// str1/cnt1 and str2/cnt2 are the two strings; ae gives their encodings
// (StrIntrinsicNode::LL/UU/LU/UL). On return, result < 0, == 0 or > 0:
// either the difference of the first pair of differing characters, or the
// length difference (cnt1 - cnt2) when one string is a prefix of the other.
// Long inputs (>= stub_threshold characters) are handed off to the
// pre-generated compare_long_string_* stubs.
// Clobbers rscratch1/rscratch2, cnt1/cnt2, str1/str2, tmp1/tmp2 and vtmp1-vtmp3.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical addresses with equal minimum length compare equal up to
      // the length difference already stored in result.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin1 side is loaded 4 bytes at a time and widened to UTF-16 with
      // zip1 against a zero vector before the 64-bit comparison.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    // Round the bit index down to a character boundary before shifting.
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    // Long strings: delegate to the pre-generated stub for this encoding.
    RuntimeAddress stub = NULL;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
    trampoline_call(stub);
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

// Emit a NEON element-wise compare of src1 and src2 into dst, setting each
// lane to all-ones on success and all-zeros on failure. The BoolTest
// condition is mapped onto the available instructions: 'ne' is synthesized
// as eq followed by a bitwise NOT, and le/lt (and unsigned ule/ult) are
// implemented by swapping operands of ge/gt (cmhs/cmhi).
void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, int cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    switch (cond) {
      case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        fcmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: fcmge(dst, size, src1, src2); break;
      case BoolTest::gt: fcmgt(dst, size, src1, src2); break;
      case BoolTest::le: fcmge(dst, size, src2, src1); break;
      case BoolTest::lt: fcmgt(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  } else {
    switch (cond) {
      case BoolTest::eq: cmeq(dst, size, src1, src2); break;
      case BoolTest::ne: {
        cmeq(dst, size, src1, src2);
        notr(dst, T16B, dst);
        break;
      }
      case BoolTest::ge: cmge(dst, size, src1, src2); break;
      case BoolTest::gt: cmgt(dst, size, src1, src2); break;
      case BoolTest::le: cmge(dst, size, src2, src1); break;
      case BoolTest::lt: cmgt(dst, size, src2, src1); break;
      case BoolTest::uge: cmhs(dst, size, src1, src2); break;
      case BoolTest::ugt: cmhi(dst, size, src1, src2); break;
      case BoolTest::ult: cmhi(dst, size, src2, src1); break;
      case BoolTest::ule: cmhs(dst, size, src2, src1); break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
950 void C2_MacroAssembler::bytemask_compress(Register dst) { 951 // Example input, dst = 0x01 00 00 00 01 01 00 01 952 // The "??" bytes are garbage. 953 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 954 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 955 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 956 andr(dst, dst, 0xff); // dst = 0x8D 957 } 958 959 // Pack the lowest-numbered bit of each mask element in src into a long value 960 // in dst, at most the first 64 lane elements. 961 // Clobbers: rscratch1 962 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 963 FloatRegister vtmp1, FloatRegister vtmp2) { 964 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 965 assert_different_registers(dst, rscratch1); 966 assert_different_registers(vtmp1, vtmp2); 967 968 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 969 970 // Pack the mask into vector with sequential bytes. 971 sve_cpy(vtmp1, size, src, 1, false); 972 if (bt != T_BYTE) { 973 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 974 } 975 976 // Compress the lowest 8 bytes. 977 fmovd(dst, vtmp1); 978 bytemask_compress(dst); 979 if (lane_cnt <= 8) return; 980 981 // Repeat on higher bytes and join the results. 982 // Compress 8 bytes in each iteration. 983 for (int idx = 1; idx < (lane_cnt / 8); idx++) { 984 sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2); 985 bytemask_compress(rscratch1); 986 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); 987 } 988 } 989 990 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 991 FloatRegister zn, FloatRegister zm, int cond) { 992 assert(pg->is_governing(), "This register has to be a governing predicate register"); 993 FloatRegister z1 = zn, z2 = zm; 994 // Convert the original BoolTest condition to Assembler::condition. 
Condition condition;
  switch (cond) {
    case BoolTest::eq: condition = Assembler::EQ; break;
    case BoolTest::ne: condition = Assembler::NE; break;
    // lt/le are expressed by swapping the operands and reusing GT/GE.
    case BoolTest::le: z1 = zm; z2 = zn; condition = Assembler::GE; break;
    case BoolTest::ge: condition = Assembler::GE; break;
    case BoolTest::lt: z1 = zm; z2 = zn; condition = Assembler::GT; break;
    case BoolTest::gt: condition = Assembler::GT; break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    sve_fcm(condition, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(condition, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
// Clobbers: rscratch1 and ptmp.
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the lanes so the last set lane becomes the first, count the lanes
  // in front of it (BRKB + CNTP), then convert that count back into an index
  // from the front: dst = (max_lane_cnt - 1) - count.
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Widen src elements of src_size into dst elements of dst_size by repeatedly
// sign-extending the low half of the vector (SUNPKLO).
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
  if (src_size == B) {
    switch (dst_size) {
    case H:
      sve_sunpklo(dst, H, src);
      break;
    case S:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      break;
    case D:
      sve_sunpklo(dst, H, src);
      sve_sunpklo(dst, S, dst);
      sve_sunpklo(dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      sve_sunpklo(dst, S, src);
    } else { // D
      sve_sunpklo(dst, S, src);
sve_sunpklo(dst, D, dst);
    }
  } else if (src_size == S) {
    sve_sunpklo(dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  // tmp is zeroed and used as the second UZP1 operand, so the upper half of
  // dst is filled with zeros rather than with data from another register.
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g.
// 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
  // Each PUNPKLO doubles the predicate element size; apply it 1/2/3 times
  // for a 2x/4x/8x widening.
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    // Each UZP1 halves the predicate element size. Using src (or dst) for
    // both operands is fine because the insignificant bits are zero.
    sve_uzp1(dst, B, src, src);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, src);
    sve_uzp1(dst, B, dst, dst);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, src);
    sve_uzp1(dst, H, dst, dst);
    sve_uzp1(dst, B, dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Reduce the lanes of vector src2 together with scalar src1 into scalar dst,
// according to the reduction opcode (add/and/or/xor/max/min). pg governs
// which lanes of src2 participate.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      // Reduce the vector, move lane 0 out (signed), then add the scalar.
      // Sub-int element types sign-extend the vector result during the add.
      sve_uaddv(tmp, size, pg, src2);
      smov(dst, tmp, size, 0);
      if (bt == T_BYTE) {
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        addw(dst, src1, dst, ext::sxth);
      } else {
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_LONG) {
        umov(dst, tmp, size, 0);
        andr(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_LONG) {
        umov(dst, tmp, size, 0);
        orr(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_LONG) {
        umov(dst, tmp, size, 0);
        eor(dst, dst, src1);
      } else {
        smov(dst, tmp, size, 0);
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      // Vector max, then a compare-and-select against the scalar.
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_LONG) {
        umov(dst, tmp, size, 0);
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        smov(dst, tmp, size, 0);
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_LONG) {
        umov(dst, tmp, size, 0);
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        smov(dst, tmp, size, 0);
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // For the bitwise reductions on byte/short elements, sign-extend the final
  // result back to the sub-int element width.
  if
(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true if the element number is
// in the range of [0, lane_cnt), or to false otherwise.
void C2_MacroAssembler::sve_ptrue_lanecnt(PRegister dst, SIMD_RegVariant size, int lane_cnt) {
  assert(size != Q, "invalid size");
  // The PTRUE patterns VL1..VL8 are encoded as the values 1..8, so lane_cnt
  // can be passed straight through; the larger fixed counts use dedicated
  // pattern encodings (VL16..VL256). Other counts are unsupported.
  switch(lane_cnt) {
  case 1: /* VL1 */
  case 2: /* VL2 */
  case 3: /* VL3 */
  case 4: /* VL4 */
  case 5: /* VL5 */
  case 6: /* VL6 */
  case 7: /* VL7 */
  case 8: /* VL8 */
    sve_ptrue(dst, size, lane_cnt);
    break;
  case 16:
    sve_ptrue(dst, size, /* VL16 */ 0b01001);
    break;
  case 32:
    sve_ptrue(dst, size, /* VL32 */ 0b01010);
    break;
  case 64:
    sve_ptrue(dst, size, /* VL64 */ 0b01011);
    break;
  case 128:
    sve_ptrue(dst, size, /* VL128 */ 0b01100);
    break;
  case 256:
    sve_ptrue(dst, size, /* VL256 */ 0b01101);
    break;
  default:
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,
                                             bool is_signed, FloatRegister vtmp) {
  assert(UseSVE > 0 && size != Q, "unsupported");
  assert(!(is_signed && size == D), "signed extract (D) not supported.");
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    is_signed ? smov(dst, src, size, idx) : umov(dst, src, size, idx);
  } else {
    // Element lies beyond the NEON-addressable low 128 bits: copy src and
    // rotate the wanted element down to lane 0 with EXT, then move lane 0
    // out. (Byte offset "idx << size" assumes SIMD_RegVariant encodes log2
    // of the element byte width — TODO confirm enum values.)
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    is_signed ?
smov(dst, vtmp, size, 0) : umov(dst, vtmp, size, 0);
  }
}

// java.lang.Math::round intrinsics

// Vector variant of Math::round for float/double lanes, using NEON.
// Clobbers: rscratch1, tmp1-tmp3.
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  // tmp1 = vector of 0.5; rscratch1 = bit pattern of 2^23 (float) resp.
  // 2^52 (double), the smallest magnitude with no fractional part.
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Classify lanes by an unsigned integer compare of the bit pattern of -src
  // against the 2^23 / 2^52 pattern.
  // NOTE(review): the selected lanes keep the fcvtas result, the rest take
  // tmp1 via BIF (bitwise insert if false) — confirm intended lane classes.
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cmhs(tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Vector variant of Math::round for float/double lanes, using SVE.
// Clobbers: rscratch1, tmp1, tmp2, ptmp.
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister ptmp, SIMD_RegVariant T) {
  assert_different_registers(tmp1, tmp2, src, dst);

  // rscratch1 = bit pattern of 2^23 (float) resp. 2^52 (double), the
  // smallest magnitude with no fractional part.
  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid arrangement");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // Select the lanes that need the floor(src + 0.5) treatment by comparing
  // the 2^23 / 2^52 pattern with the bit pattern of -src (unsigned HS).
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, ptmp, T, ptrue, tmp2, tmp1);
  // NOTE(review): br(EQ) skips the slow path when the compare selected no
  // lane — confirm the condition-flag semantics of sve_cmp here.
  br(EQ, none);
  {
    // Recompute only the selected lanes as floor(src + 0.5).
    sve_cpy(tmp1, T, ptmp, 0.5);
    sve_fadd(tmp1, T, ptmp, src);
    sve_frintm(dst, T, ptmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}