1 /* 2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 36 #ifdef PRODUCT 37 #define BLOCK_COMMENT(str) /* nothing */ 38 #define STOP(error) stop(error) 39 #else 40 #define BLOCK_COMMENT(str) block_comment(str) 41 #define STOP(error) block_comment(error); stop(error) 42 #endif 43 44 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 45 46 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 47 48 // Search for str1 in str2 and return index or -1 49 void C2_MacroAssembler::string_indexof(Register str2, Register str1, 50 Register cnt2, Register cnt1, 51 Register tmp1, Register tmp2, 52 Register tmp3, Register tmp4, 53 Register tmp5, Register tmp6, 54 int icnt1, Register result, int ae) { 55 // NOTE: tmp5, tmp6 can be zr depending on specific method version 56 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 57 58 Register ch1 = rscratch1; 59 Register ch2 = rscratch2; 60 Register cnt1tmp = tmp1; 61 Register cnt2tmp = tmp2; 62 Register cnt1_neg = cnt1; 63 Register cnt2_neg = cnt2; 64 Register result_tmp = tmp4; 65 66 bool isL = ae == StrIntrinsicNode::LL; 67 68 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 69 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 70 int str1_chr_shift = str1_isL ? 0:1; 71 int str2_chr_shift = str2_isL ? 0:1; 72 int str1_chr_size = str1_isL ? 1:2; 73 int str2_chr_size = str2_isL ? 1:2; 74 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 75 (chr_insn)&MacroAssembler::ldrh; 76 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 77 (chr_insn)&MacroAssembler::ldrh; 78 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 79 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 80 81 // Note, inline_string_indexOf() generates checks: 82 // if (substr.count > string.count) return -1; 83 // if (substr.count == 0) return 0; 84 85 // We have two strings, a source string in str2, cnt2 and a pattern string 86 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1. 
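  // As a rough reference only (an illustration, not the code generated below),
  // the search computes the equivalent of:
  //
  //   for (j = 0; j <= cnt2 - cnt1; j++) {
  //     if (the cnt1 characters of str1 match str2 starting at j) return j;
  //   }
  //   return -1;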

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few Java-specific optimizations.
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //       int i, j;
  //       unsigned c;
  //       unsigned char bc[ASIZE];
  //
  //       /* Preprocessing */
  //       for (i = 0; i < ASIZE; ++i)
  //          bc[i] = m;
  //       for (i = 0; i < m - 1; ) {
  //          c = x[i];
  //          ++i;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef PATTERN_STRING_IS_LATIN1
  //          bc[c] = m - i;
  //          #else
  //          if (c < ASIZE) bc[c] = m - i;
  //          #endif
  //       }
  //
  //       /* Searching */
  //       j = 0;
  //       while (j <= n - m) {
  //          c = y[i+j];
  //          if (x[m-1] == c)
  //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //          if (i < 0) return j;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef SOURCE_STRING_IS_LATIN1
  //          // LL case: (c < 256) always true. Remove branch
  //          j += bc[y[j+m-1]];
  //          #endif
  //          #ifndef PATTERN_STRING_IS_UTF
  //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += 1
  //          #endif
  //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += m
  //          #endif
  //       }
  //    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
    const int firstStep = isL ?
7 : 3; 183 184 const int ASIZE = 256; 185 const int STORED_BYTES = 32; // amount of bytes stored per instruction 186 sub(sp, sp, ASIZE); 187 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 188 mov(ch1, sp); 189 BIND(BM_INIT_LOOP); 190 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 191 subs(tmp5, tmp5, 1); 192 br(GT, BM_INIT_LOOP); 193 194 sub(cnt1tmp, cnt1, 1); 195 mov(tmp5, str2); 196 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 197 sub(ch2, cnt1, 1); 198 mov(tmp3, str1); 199 BIND(BCLOOP); 200 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 201 if (!str1_isL) { 202 subs(zr, ch1, ASIZE); 203 br(HS, BCSKIP); 204 } 205 strb(ch2, Address(sp, ch1)); 206 BIND(BCSKIP); 207 subs(ch2, ch2, 1); 208 br(GT, BCLOOP); 209 210 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 211 if (str1_isL == str2_isL) { 212 // load last 8 bytes (8LL/4UU symbols) 213 ldr(tmp6, Address(tmp6, -wordSize)); 214 } else { 215 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 216 // convert Latin1 to UTF. We'll have to wait until load completed, but 217 // it's still faster than per-character loads+checks 218 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 219 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 220 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 221 andr(tmp6, tmp6, 0xFF); // str1[N-4] 222 orr(ch2, ch1, ch2, LSL, 16); 223 orr(tmp6, tmp6, tmp3, LSL, 48); 224 orr(tmp6, tmp6, ch2, LSL, 16); 225 } 226 BIND(BMLOOPSTR2); 227 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 228 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 229 if (str1_isL == str2_isL) { 230 // re-init tmp3. It's for free because it's executed in parallel with 231 // load above. Alternative is to initialize it before loop, but it'll 232 // affect performance on in-order systems with 2 or more ld/st pipelines 233 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 234 } 235 if (!isL) { // UU/UL case 236 lsl(ch2, cnt1tmp, 1); // offset in bytes 237 } 238 cmp(tmp3, skipch); 239 br(NE, BMSKIP); 240 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 241 mov(ch1, tmp6); 242 if (isL) { 243 b(BMLOOPSTR1_AFTER_LOAD); 244 } else { 245 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 246 b(BMLOOPSTR1_CMP); 247 } 248 BIND(BMLOOPSTR1); 249 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 250 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 251 BIND(BMLOOPSTR1_AFTER_LOAD); 252 subs(cnt1tmp, cnt1tmp, 1); 253 br(LT, BMLOOPSTR1_LASTCMP); 254 BIND(BMLOOPSTR1_CMP); 255 cmp(ch1, ch2); 256 br(EQ, BMLOOPSTR1); 257 BIND(BMSKIP); 258 if (!isL) { 259 // if we've met UTF symbol while searching Latin1 pattern, then we can 260 // skip cnt1 symbols 261 if (str1_isL != str2_isL) { 262 mov(result_tmp, cnt1); 263 } else { 264 mov(result_tmp, 1); 265 } 266 subs(zr, skipch, ASIZE); 267 br(HS, BMADV); 268 } 269 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 270 BIND(BMADV); 271 sub(cnt1tmp, cnt1, 1); 272 add(str2, str2, result_tmp, LSL, str2_chr_shift); 273 cmp(str2, str2end); 274 br(LE, BMLOOPSTR2); 275 add(sp, sp, ASIZE); 276 b(NOMATCH); 277 BIND(BMLOOPSTR1_LASTCMP); 278 cmp(ch1, ch2); 279 br(NE, BMSKIP); 280 BIND(BMMATCH); 281 sub(result, str2, tmp5); 282 if (!str2_isL) lsr(result, result, 1); 283 add(sp, sp, ASIZE); 284 b(DONE); 285 286 BIND(LINEARSTUB); 287 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 288 br(LT, LINEAR_MEDIUM); 289 mov(result, zr); 290 RuntimeAddress stub = NULL; 291 if (isL) { 292 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 293 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 294 } else if (str1_isL) { 295 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 296 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 297 } else { 298 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 299 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 300 } 301 address call = trampoline_call(stub); 302 if (call == nullptr) { 303 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 304 ciEnv::current()->record_failure("CodeCache is full"); 305 return; 306 } 307 b(DONE); 308 } 309 310 BIND(LINEARSEARCH); 311 { 312 Label DO1, DO2, DO3; 313 314 Register str2tmp = tmp2; 315 Register first = tmp3; 316 317 if (icnt1 == -1) 318 { 319 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 320 321 cmp(cnt1, u1(str1_isL == str2_isL ? 
4 : 2)); 322 br(LT, DOSHORT); 323 BIND(LINEAR_MEDIUM); 324 (this->*str1_load_1chr)(first, Address(str1)); 325 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 326 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 327 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 328 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 329 330 BIND(FIRST_LOOP); 331 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 332 cmp(first, ch2); 333 br(EQ, STR1_LOOP); 334 BIND(STR2_NEXT); 335 adds(cnt2_neg, cnt2_neg, str2_chr_size); 336 br(LE, FIRST_LOOP); 337 b(NOMATCH); 338 339 BIND(STR1_LOOP); 340 adds(cnt1tmp, cnt1_neg, str1_chr_size); 341 add(cnt2tmp, cnt2_neg, str2_chr_size); 342 br(GE, MATCH); 343 344 BIND(STR1_NEXT); 345 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 346 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 347 cmp(ch1, ch2); 348 br(NE, STR2_NEXT); 349 adds(cnt1tmp, cnt1tmp, str1_chr_size); 350 add(cnt2tmp, cnt2tmp, str2_chr_size); 351 br(LT, STR1_NEXT); 352 b(MATCH); 353 354 BIND(DOSHORT); 355 if (str1_isL == str2_isL) { 356 cmp(cnt1, (u1)2); 357 br(LT, DO1); 358 br(GT, DO3); 359 } 360 } 361 362 if (icnt1 == 4) { 363 Label CH1_LOOP; 364 365 (this->*load_4chr)(ch1, str1); 366 sub(result_tmp, cnt2, 4); 367 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 368 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 369 370 BIND(CH1_LOOP); 371 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 372 cmp(ch1, ch2); 373 br(EQ, MATCH); 374 adds(cnt2_neg, cnt2_neg, str2_chr_size); 375 br(LE, CH1_LOOP); 376 b(NOMATCH); 377 } 378 379 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 380 Label CH1_LOOP; 381 382 BIND(DO2); 383 (this->*load_2chr)(ch1, str1); 384 if (icnt1 == 2) { 385 sub(result_tmp, cnt2, 2); 386 } 387 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 388 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 389 BIND(CH1_LOOP); 390 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 391 cmp(ch1, ch2); 392 br(EQ, MATCH); 393 adds(cnt2_neg, cnt2_neg, str2_chr_size); 394 br(LE, CH1_LOOP); 395 b(NOMATCH); 396 } 397 398 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 399 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 400 401 BIND(DO3); 402 (this->*load_2chr)(first, str1); 403 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 404 if (icnt1 == 3) { 405 sub(result_tmp, cnt2, 3); 406 } 407 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 408 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 409 BIND(FIRST_LOOP); 410 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 411 cmpw(first, ch2); 412 br(EQ, STR1_LOOP); 413 BIND(STR2_NEXT); 414 adds(cnt2_neg, cnt2_neg, str2_chr_size); 415 br(LE, FIRST_LOOP); 416 b(NOMATCH); 417 418 BIND(STR1_LOOP); 419 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 420 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 421 cmp(ch1, ch2); 422 br(NE, STR2_NEXT); 423 b(MATCH); 424 } 425 426 if (icnt1 == -1 || icnt1 == 1) { 427 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 428 429 BIND(DO1); 430 (this->*str1_load_1chr)(ch1, str1); 431 cmp(cnt2, (u1)8); 432 br(LT, DO1_SHORT); 433 434 sub(result_tmp, cnt2, 8/str2_chr_size); 435 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 436 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 437 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 438 439 if (str2_isL) { 440 orr(ch1, ch1, ch1, LSL, 8); 441 } 442 orr(ch1, ch1, ch1, LSL, 16); 443 orr(ch1, ch1, ch1, LSL, 32); 444 BIND(CH1_LOOP); 445 ldr(ch2, Address(str2, cnt2_neg)); 446 eor(ch2, ch1, ch2); 447 sub(tmp1, ch2, tmp3); 448 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 449 bics(tmp1, tmp1, tmp2); 450 br(NE, HAS_ZERO); 451 adds(cnt2_neg, cnt2_neg, 8); 452 br(LT, CH1_LOOP); 453 454 cmp(cnt2_neg, (u1)8); 455 mov(cnt2_neg, 0); 456 br(LT, CH1_LOOP); 457 b(NOMATCH); 458 459 BIND(HAS_ZERO); 460 rev(tmp1, tmp1); 461 clz(tmp1, tmp1); 462 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 463 b(MATCH); 464 465 BIND(DO1_SHORT); 466 mov(result_tmp, cnt2); 467 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 468 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 469 BIND(DO1_LOOP); 470 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 471 cmpw(ch1, ch2); 472 br(EQ, MATCH); 473 adds(cnt2_neg, cnt2_neg, str2_chr_size); 474 br(LT, DO1_LOOP); 475 } 476 } 477 BIND(NOMATCH); 478 mov(result, -1); 479 b(DONE); 480 BIND(MATCH); 481 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 482 BIND(DONE); 483 } 484 485 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 486 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 487 488 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 489 Register ch, Register result, 490 Register tmp1, Register tmp2, Register tmp3) 491 { 492 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 493 Register cnt1_neg = cnt1; 494 Register ch1 = rscratch1; 495 Register result_tmp = rscratch2; 496 497 cbz(cnt1, NOMATCH); 498 499 cmp(cnt1, (u1)4); 500 br(LT, DO1_SHORT); 501 502 orr(ch, ch, ch, LSL, 16); 503 orr(ch, ch, ch, LSL, 32); 504 505 sub(cnt1, cnt1, 4); 506 mov(result_tmp, cnt1); 507 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 508 sub(cnt1_neg, zr, cnt1, LSL, 1); 509 510 mov(tmp3, 0x0001000100010001); 511 512 BIND(CH1_LOOP); 513 ldr(ch1, Address(str1, cnt1_neg)); 514 eor(ch1, ch, ch1); 515 sub(tmp1, ch1, tmp3); 516 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 517 bics(tmp1, tmp1, tmp2); 518 br(NE, HAS_ZERO); 519 adds(cnt1_neg, cnt1_neg, 8); 520 br(LT, CH1_LOOP); 521 522 cmp(cnt1_neg, (u1)8); 523 mov(cnt1_neg, 0); 524 br(LT, CH1_LOOP); 525 b(NOMATCH); 526 527 BIND(HAS_ZERO); 528 rev(tmp1, tmp1); 529 clz(tmp1, tmp1); 530 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 531 b(MATCH); 532 533 BIND(DO1_SHORT); 534 mov(result_tmp, cnt1); 535 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 536 sub(cnt1_neg, zr, cnt1, LSL, 1); 537 BIND(DO1_LOOP); 538 ldrh(ch1, Address(str1, cnt1_neg)); 539 cmpw(ch, ch1); 540 br(EQ, MATCH); 541 adds(cnt1_neg, cnt1_neg, 2); 542 br(LT, DO1_LOOP); 543 BIND(NOMATCH); 544 mov(result, -1); 545 b(DONE); 546 BIND(MATCH); 547 add(result, result_tmp, cnt1_neg, ASR, 1); 548 BIND(DONE); 549 } 550 551 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 552 Register ch, Register result, 553 FloatRegister ztmp1, 554 FloatRegister ztmp2, 555 PRegister tmp_pg, 556 PRegister tmp_pdn, bool isL) 557 { 558 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 559 assert(tmp_pg->is_governing(), 560 "this register has to be a governing predicate register"); 561 562 Label LOOP, MATCH, DONE, NOMATCH; 563 Register vec_len = rscratch1; 564 Register idx = rscratch2; 565 566 SIMD_RegVariant T = (isL == true) ? 
B : H; 567 568 cbz(cnt1, NOMATCH); 569 570 // Assign the particular char throughout the vector. 571 sve_dup(ztmp2, T, ch); 572 if (isL) { 573 sve_cntb(vec_len); 574 } else { 575 sve_cnth(vec_len); 576 } 577 mov(idx, 0); 578 579 // Generate a predicate to control the reading of input string. 580 sve_whilelt(tmp_pg, T, idx, cnt1); 581 582 BIND(LOOP); 583 // Read a vector of 8- or 16-bit data depending on the string type. Note 584 // that inactive elements indicated by the predicate register won't cause 585 // a data read from memory to the destination vector. 586 if (isL) { 587 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 588 } else { 589 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 590 } 591 add(idx, idx, vec_len); 592 593 // Perform the comparison. An element of the destination predicate is set 594 // to active if the particular char is matched. 595 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 596 597 // Branch if the particular char is found. 598 br(NE, MATCH); 599 600 sve_whilelt(tmp_pg, T, idx, cnt1); 601 602 // Loop back if the particular char not found. 603 br(MI, LOOP); 604 605 BIND(NOMATCH); 606 mov(result, -1); 607 b(DONE); 608 609 BIND(MATCH); 610 // Undo the index increment. 611 sub(idx, idx, vec_len); 612 613 // Crop the vector to find its location. 614 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 615 add(result, idx, -1); 616 sve_incp(result, T, tmp_pdn); 617 BIND(DONE); 618 } 619 620 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 621 Register ch, Register result, 622 Register tmp1, Register tmp2, Register tmp3) 623 { 624 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 625 Register cnt1_neg = cnt1; 626 Register ch1 = rscratch1; 627 Register result_tmp = rscratch2; 628 629 cbz(cnt1, NOMATCH); 630 631 cmp(cnt1, (u1)8); 632 br(LT, DO1_SHORT); 633 634 orr(ch, ch, ch, LSL, 8); 635 orr(ch, ch, ch, LSL, 16); 636 orr(ch, ch, ch, LSL, 32); 637 638 sub(cnt1, cnt1, 8); 639 mov(result_tmp, cnt1); 640 lea(str1, Address(str1, cnt1)); 641 sub(cnt1_neg, zr, cnt1); 642 643 mov(tmp3, 0x0101010101010101); 644 645 BIND(CH1_LOOP); 646 ldr(ch1, Address(str1, cnt1_neg)); 647 eor(ch1, ch, ch1); 648 sub(tmp1, ch1, tmp3); 649 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 650 bics(tmp1, tmp1, tmp2); 651 br(NE, HAS_ZERO); 652 adds(cnt1_neg, cnt1_neg, 8); 653 br(LT, CH1_LOOP); 654 655 cmp(cnt1_neg, (u1)8); 656 mov(cnt1_neg, 0); 657 br(LT, CH1_LOOP); 658 b(NOMATCH); 659 660 BIND(HAS_ZERO); 661 rev(tmp1, tmp1); 662 clz(tmp1, tmp1); 663 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 664 b(MATCH); 665 666 BIND(DO1_SHORT); 667 mov(result_tmp, cnt1); 668 lea(str1, Address(str1, cnt1)); 669 sub(cnt1_neg, zr, cnt1); 670 BIND(DO1_LOOP); 671 ldrb(ch1, Address(str1, cnt1_neg)); 672 cmp(ch, ch1); 673 br(EQ, MATCH); 674 adds(cnt1_neg, cnt1_neg, 1); 675 br(LT, DO1_LOOP); 676 BIND(NOMATCH); 677 mov(result, -1); 678 b(DONE); 679 BIND(MATCH); 680 add(result, result_tmp, cnt1_neg); 681 BIND(DONE); 682 } 683 684 // Compare strings. 
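// Roughly follows the String.compareTo contract: the result is the difference
// of the first pair of characters that differ, or, when the shorter string is
// a prefix of the longer one, the difference of the lengths (in characters).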
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
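  // i.e. result = cnt1 - cnt2, cnt2 = min(cnt1, cnt2); if all of the first
  // min(cnt1, cnt2) characters match, the length difference already held in
  // 'result' is the value to return.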
727 subsw(result, cnt1, cnt2); 728 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 729 730 // A very short string 731 cmpw(cnt2, minCharsInWord); 732 br(Assembler::LE, SHORT_STRING); 733 734 // Compare longwords 735 // load first parts of strings and finish initialization while loading 736 { 737 if (str1_isL == str2_isL) { // LL or UU 738 ldr(tmp1, Address(str1)); 739 cmp(str1, str2); 740 br(Assembler::EQ, DONE); 741 ldr(tmp2, Address(str2)); 742 cmp(cnt2, stub_threshold); 743 br(GE, STUB); 744 subsw(cnt2, cnt2, minCharsInWord); 745 br(EQ, TAIL_CHECK); 746 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 747 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 748 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 749 } else if (isLU) { 750 ldrs(vtmp, Address(str1)); 751 ldr(tmp2, Address(str2)); 752 cmp(cnt2, stub_threshold); 753 br(GE, STUB); 754 subw(cnt2, cnt2, 4); 755 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 756 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 757 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 758 zip1(vtmp, T8B, vtmp, vtmpZ); 759 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 760 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 761 add(cnt1, cnt1, 4); 762 fmovd(tmp1, vtmp); 763 } else { // UL case 764 ldr(tmp1, Address(str1)); 765 ldrs(vtmp, Address(str2)); 766 cmp(cnt2, stub_threshold); 767 br(GE, STUB); 768 subw(cnt2, cnt2, 4); 769 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 770 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 771 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 772 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 773 zip1(vtmp, T8B, vtmp, vtmpZ); 774 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 775 add(cnt1, cnt1, 8); 776 fmovd(tmp2, vtmp); 777 } 778 adds(cnt2, cnt2, isUL ? 4 : 8); 779 br(GE, TAIL); 780 eor(rscratch2, tmp1, tmp2); 781 cbnz(rscratch2, DIFF); 782 // main loop 783 bind(NEXT_WORD); 784 if (str1_isL == str2_isL) { 785 ldr(tmp1, Address(str1, cnt2)); 786 ldr(tmp2, Address(str2, cnt2)); 787 adds(cnt2, cnt2, 8); 788 } else if (isLU) { 789 ldrs(vtmp, Address(str1, cnt1)); 790 ldr(tmp2, Address(str2, cnt2)); 791 add(cnt1, cnt1, 4); 792 zip1(vtmp, T8B, vtmp, vtmpZ); 793 fmovd(tmp1, vtmp); 794 adds(cnt2, cnt2, 8); 795 } else { // UL 796 ldrs(vtmp, Address(str2, cnt2)); 797 ldr(tmp1, Address(str1, cnt1)); 798 zip1(vtmp, T8B, vtmp, vtmpZ); 799 add(cnt1, cnt1, 8); 800 fmovd(tmp2, vtmp); 801 adds(cnt2, cnt2, 4); 802 } 803 br(GE, TAIL); 804 805 eor(rscratch2, tmp1, tmp2); 806 cbz(rscratch2, NEXT_WORD); 807 b(DIFF); 808 bind(TAIL); 809 eor(rscratch2, tmp1, tmp2); 810 cbnz(rscratch2, DIFF); 811 // Last longword. In the case where length == 4 we compare the 812 // same longword twice, but that's still faster than another 813 // conditional branch. 814 if (str1_isL == str2_isL) { 815 ldr(tmp1, Address(str1)); 816 ldr(tmp2, Address(str2)); 817 } else if (isLU) { 818 ldrs(vtmp, Address(str1)); 819 ldr(tmp2, Address(str2)); 820 zip1(vtmp, T8B, vtmp, vtmpZ); 821 fmovd(tmp1, vtmp); 822 } else { // UL 823 ldrs(vtmp, Address(str2)); 824 ldr(tmp1, Address(str1)); 825 zip1(vtmp, T8B, vtmp, vtmpZ); 826 fmovd(tmp2, vtmp); 827 } 828 bind(TAIL_CHECK); 829 eor(rscratch2, tmp1, tmp2); 830 cbz(rscratch2, DONE); 831 832 // Find the first different characters in the longwords and 833 // compute their difference. 834 bind(DIFF); 835 rev(rscratch2, rscratch2); 836 clz(rscratch2, rscratch2); 837 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 838 lsrv(tmp1, tmp1, rscratch2); 839 (this->*ext_chr)(tmp1, tmp1); 840 lsrv(tmp2, tmp2, rscratch2); 841 (this->*ext_chr)(tmp2, tmp2); 842 subw(result, tmp1, tmp2); 843 b(DONE); 844 } 845 846 bind(STUB); 847 RuntimeAddress stub = NULL; 848 switch(ae) { 849 case StrIntrinsicNode::LL: 850 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 851 break; 852 case StrIntrinsicNode::UU: 853 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 854 break; 855 case StrIntrinsicNode::LU: 856 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 857 break; 858 case StrIntrinsicNode::UL: 859 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 860 break; 861 default: 862 ShouldNotReachHere(); 863 } 864 assert(stub.target() != NULL, "compare_long_string stub has not been generated"); 865 address call = trampoline_call(stub); 866 if (call == nullptr) { 867 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 868 ciEnv::current()->record_failure("CodeCache is full"); 869 return; 870 } 871 b(DONE); 872 873 bind(SHORT_STRING); 874 // Is the minimum length zero? 875 cbz(cnt2, DONE); 876 // arrange code to do most branches while loading and loading next characters 877 // while comparing previous 878 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 879 subs(cnt2, cnt2, 1); 880 br(EQ, SHORT_LAST_INIT); 881 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 882 b(SHORT_LOOP_START); 883 bind(SHORT_LOOP); 884 subs(cnt2, cnt2, 1); 885 br(EQ, SHORT_LAST); 886 bind(SHORT_LOOP_START); 887 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 888 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 889 cmp(tmp1, cnt1); 890 br(NE, SHORT_LOOP_TAIL); 891 subs(cnt2, cnt2, 1); 892 br(EQ, SHORT_LAST2); 893 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 894 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 895 cmp(tmp2, rscratch1); 896 br(EQ, SHORT_LOOP); 897 sub(result, tmp2, rscratch1); 898 b(DONE); 899 bind(SHORT_LOOP_TAIL); 900 sub(result, tmp1, cnt1); 901 b(DONE); 902 bind(SHORT_LAST2); 903 cmp(tmp2, rscratch1); 904 br(EQ, DONE); 905 sub(result, tmp2, rscratch1); 906 907 b(DONE); 908 bind(SHORT_LAST_INIT); 909 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 910 bind(SHORT_LAST); 911 cmp(tmp1, cnt1); 912 br(EQ, DONE); 913 sub(result, tmp1, cnt1); 914 915 bind(DONE); 916 917 BLOCK_COMMENT("} string_compare"); 918 } 919 920 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 921 FloatRegister src2, int cond, bool isQ) { 922 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 923 if (bt == T_FLOAT || bt == T_DOUBLE) { 924 switch (cond) { 925 case BoolTest::eq: fcmeq(dst, size, src1, src2); break; 926 case BoolTest::ne: { 927 fcmeq(dst, size, src1, src2); 928 notr(dst, T16B, dst); 929 break; 930 } 931 case BoolTest::ge: fcmge(dst, size, src1, src2); break; 932 case BoolTest::gt: fcmgt(dst, size, src1, src2); break; 933 case BoolTest::le: fcmge(dst, size, src2, src1); break; 934 case BoolTest::lt: fcmgt(dst, size, src2, src1); break; 935 default: 936 assert(false, "unsupported"); 937 ShouldNotReachHere(); 938 } 939 } else { 940 switch (cond) { 941 case BoolTest::eq: cmeq(dst, size, src1, src2); break; 942 case BoolTest::ne: { 943 cmeq(dst, size, src1, src2); 944 notr(dst, T16B, dst); 
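      // AArch64 ASIMD has no compare-not-equal instruction, so 'ne' is
      // synthesized as not(eq), matching the floating-point case above.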
945 break; 946 } 947 case BoolTest::ge: cmge(dst, size, src1, src2); break; 948 case BoolTest::gt: cmgt(dst, size, src1, src2); break; 949 case BoolTest::le: cmge(dst, size, src2, src1); break; 950 case BoolTest::lt: cmgt(dst, size, src2, src1); break; 951 case BoolTest::uge: cmhs(dst, size, src1, src2); break; 952 case BoolTest::ugt: cmhi(dst, size, src1, src2); break; 953 case BoolTest::ult: cmhi(dst, size, src2, src1); break; 954 case BoolTest::ule: cmhs(dst, size, src2, src1); break; 955 default: 956 assert(false, "unsupported"); 957 ShouldNotReachHere(); 958 } 959 } 960 } 961 962 // Compress the least significant bit of each byte to the rightmost and clear 963 // the higher garbage bits. 964 void C2_MacroAssembler::bytemask_compress(Register dst) { 965 // Example input, dst = 0x01 00 00 00 01 01 00 01 966 // The "??" bytes are garbage. 967 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 968 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 969 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 970 andr(dst, dst, 0xff); // dst = 0x8D 971 } 972 973 // Pack the lowest-numbered bit of each mask element in src into a long value 974 // in dst, at most the first 64 lane elements. 975 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 976 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 977 FloatRegister vtmp1, FloatRegister vtmp2) { 978 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 979 assert_different_registers(dst, rscratch1); 980 assert_different_registers(vtmp1, vtmp2); 981 982 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 983 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 984 // Expected: dst = 0x658D 985 986 // Convert the mask into vector with sequential bytes. 987 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 988 sve_cpy(vtmp1, size, src, 1, false); 989 if (bt != T_BYTE) { 990 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 991 } 992 993 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 994 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 995 // is to compress each significant bit of the byte in a cross-lane way. Due 996 // to the lack of a cross-lane bit-compress instruction, we use BEXT 997 // (bit-compress in each lane) with the biggest lane size (T = D) then 998 // concatenate the results. 999 1000 // The second source input of BEXT, initialized with 0x01 in each byte. 1001 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1002 sve_dup(vtmp2, B, 1); 1003 1004 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1005 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1006 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1007 // --------------------------------------- 1008 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1009 sve_bext(vtmp1, D, vtmp1, vtmp2); 1010 1011 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1012 // result to dst. 1013 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1014 // dst = 0x658D 1015 if (lane_cnt <= 8) { 1016 // No need to concatenate. 1017 umov(dst, vtmp1, B, 0); 1018 } else if (lane_cnt <= 16) { 1019 ins(vtmp1, B, vtmp1, 1, 8); 1020 umov(dst, vtmp1, H, 0); 1021 } else { 1022 // As the lane count is 64 at most, the final expected value must be in 1023 // the lowest 64 bits after narrowing vtmp1 from D to B. 
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register in different types,
// with a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g. 0xFF658D,
// which has 24 significant bits, would be an invalid input if the dst predicate
// register refers to a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01101001 10001101

  // Put the long value from the general-purpose register into the first lane
  // of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum granularity of one byte, we
  // have to transform the value in the first lane, which is a mask in bits for
  // now, into a mask in bytes. This can be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte into
  // every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for
  // each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  // ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate the mask according to the given vector, in which the elements have
  // been extended to the expected type.
1102 // dst = 0b01101001 10001101 1103 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1104 } 1105 1106 // Clobbers: rflags 1107 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1108 FloatRegister zn, FloatRegister zm, int cond) { 1109 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1110 FloatRegister z1 = zn, z2 = zm; 1111 // Convert the original BoolTest condition to Assembler::condition. 1112 Condition condition; 1113 switch (cond) { 1114 case BoolTest::eq: condition = Assembler::EQ; break; 1115 case BoolTest::ne: condition = Assembler::NE; break; 1116 case BoolTest::le: z1 = zm; z2 = zn; condition = Assembler::GE; break; 1117 case BoolTest::ge: condition = Assembler::GE; break; 1118 case BoolTest::lt: z1 = zm; z2 = zn; condition = Assembler::GT; break; 1119 case BoolTest::gt: condition = Assembler::GT; break; 1120 default: 1121 assert(false, "unsupported compare condition"); 1122 ShouldNotReachHere(); 1123 } 1124 1125 SIMD_RegVariant size = elemType_to_regVariant(bt); 1126 if (bt == T_FLOAT || bt == T_DOUBLE) { 1127 sve_fcm(condition, pd, size, pg, z1, z2); 1128 } else { 1129 assert(is_integral_type(bt), "unsupported element type"); 1130 sve_cmp(condition, pd, size, pg, z1, z2); 1131 } 1132 } 1133 1134 // Get index of the last mask lane that is set 1135 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1136 SIMD_RegVariant size = elemType_to_regVariant(bt); 1137 sve_rev(ptmp, size, src); 1138 sve_brkb(ptmp, ptrue, ptmp, false); 1139 sve_cntp(dst, size, ptrue, ptmp); 1140 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1141 subw(dst, rscratch1, dst); 1142 } 1143 1144 // Extend integer vector src to dst with the same lane count 1145 // but larger element size, e.g. 4B -> 4I 1146 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1147 FloatRegister src, BasicType src_bt) { 1148 if (src_bt == T_BYTE) { 1149 if (dst_bt == T_SHORT) { 1150 // 4B/8B to 4S/8S 1151 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported"); 1152 sxtl(dst, T8H, src, T8B); 1153 } else { 1154 // 4B to 4I 1155 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1156 sxtl(dst, T8H, src, T8B); 1157 sxtl(dst, T4S, dst, T4H); 1158 } 1159 } else if (src_bt == T_SHORT) { 1160 // 4S to 4I 1161 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1162 sxtl(dst, T4S, src, T4H); 1163 } else if (src_bt == T_INT) { 1164 // 2I to 2L 1165 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1166 sxtl(dst, T2D, src, T2S); 1167 } else { 1168 ShouldNotReachHere(); 1169 } 1170 } 1171 1172 // Narrow integer vector src down to dst with the same lane count 1173 // but smaller element size, e.g. 
4I -> 4B 1174 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1175 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1176 if (src_bt == T_SHORT) { 1177 // 4S/8S to 4B/8B 1178 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1179 assert(dst_bt == T_BYTE, "unsupported"); 1180 xtn(dst, T8B, src, T8H); 1181 } else if (src_bt == T_INT) { 1182 // 4I to 4B/4S 1183 assert(src_vlen_in_bytes == 16, "unsupported"); 1184 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1185 xtn(dst, T4H, src, T4S); 1186 if (dst_bt == T_BYTE) { 1187 xtn(dst, T8B, dst, T8H); 1188 } 1189 } else if (src_bt == T_LONG) { 1190 // 2L to 2I 1191 assert(src_vlen_in_bytes == 16, "unsupported"); 1192 assert(dst_bt == T_INT, "unsupported"); 1193 xtn(dst, T2S, src, T2D); 1194 } else { 1195 ShouldNotReachHere(); 1196 } 1197 } 1198 1199 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1200 FloatRegister src, SIMD_RegVariant src_size) { 1201 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1202 if (src_size == B) { 1203 switch (dst_size) { 1204 case H: 1205 sve_sunpklo(dst, H, src); 1206 break; 1207 case S: 1208 sve_sunpklo(dst, H, src); 1209 sve_sunpklo(dst, S, dst); 1210 break; 1211 case D: 1212 sve_sunpklo(dst, H, src); 1213 sve_sunpklo(dst, S, dst); 1214 sve_sunpklo(dst, D, dst); 1215 break; 1216 default: 1217 ShouldNotReachHere(); 1218 } 1219 } else if (src_size == H) { 1220 if (dst_size == S) { 1221 sve_sunpklo(dst, S, src); 1222 } else { // D 1223 sve_sunpklo(dst, S, src); 1224 sve_sunpklo(dst, D, dst); 1225 } 1226 } else if (src_size == S) { 1227 sve_sunpklo(dst, D, src); 1228 } 1229 } 1230 1231 // Vector narrow from src to dst with specified element sizes. 1232 // High part of dst vector will be filled with zero. 1233 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1234 FloatRegister src, SIMD_RegVariant src_size, 1235 FloatRegister tmp) { 1236 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1237 assert_different_registers(src, tmp); 1238 sve_dup(tmp, src_size, 0); 1239 if (src_size == D) { 1240 switch (dst_size) { 1241 case S: 1242 sve_uzp1(dst, S, src, tmp); 1243 break; 1244 case H: 1245 assert_different_registers(dst, tmp); 1246 sve_uzp1(dst, S, src, tmp); 1247 sve_uzp1(dst, H, dst, tmp); 1248 break; 1249 case B: 1250 assert_different_registers(dst, tmp); 1251 sve_uzp1(dst, S, src, tmp); 1252 sve_uzp1(dst, H, dst, tmp); 1253 sve_uzp1(dst, B, dst, tmp); 1254 break; 1255 default: 1256 ShouldNotReachHere(); 1257 } 1258 } else if (src_size == S) { 1259 if (dst_size == H) { 1260 sve_uzp1(dst, H, src, tmp); 1261 } else { // B 1262 assert_different_registers(dst, tmp); 1263 sve_uzp1(dst, H, src, tmp); 1264 sve_uzp1(dst, B, dst, tmp); 1265 } 1266 } else if (src_size == H) { 1267 sve_uzp1(dst, B, src, tmp); 1268 } 1269 } 1270 1271 // Extend src predicate to dst predicate with the same lane count but larger 1272 // element size, e.g. 
64Byte -> 512Long 1273 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1274 uint dst_element_length_in_bytes, 1275 uint src_element_length_in_bytes) { 1276 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1277 sve_punpklo(dst, src); 1278 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1279 sve_punpklo(dst, src); 1280 sve_punpklo(dst, dst); 1281 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1282 sve_punpklo(dst, src); 1283 sve_punpklo(dst, dst); 1284 sve_punpklo(dst, dst); 1285 } else { 1286 assert(false, "unsupported"); 1287 ShouldNotReachHere(); 1288 } 1289 } 1290 1291 // Narrow src predicate to dst predicate with the same lane count but 1292 // smaller element size, e.g. 512Long -> 64Byte 1293 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, 1294 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1295 // The insignificant bits in src predicate are expected to be zero. 1296 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1297 sve_uzp1(dst, B, src, src); 1298 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1299 sve_uzp1(dst, H, src, src); 1300 sve_uzp1(dst, B, dst, dst); 1301 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1302 sve_uzp1(dst, S, src, src); 1303 sve_uzp1(dst, H, dst, dst); 1304 sve_uzp1(dst, B, dst, dst); 1305 } else { 1306 assert(false, "unsupported"); 1307 ShouldNotReachHere(); 1308 } 1309 } 1310 1311 // Vector reduction add for integral type with ASIMD instructions. 1312 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1313 Register isrc, FloatRegister vsrc, 1314 unsigned vector_length_in_bytes, 1315 FloatRegister vtmp) { 1316 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1317 assert_different_registers(dst, isrc); 1318 bool isQ = vector_length_in_bytes == 16; 1319 1320 BLOCK_COMMENT("neon_reduce_add_integral {"); 1321 switch(bt) { 1322 case T_BYTE: 1323 addv(vtmp, isQ ? T16B : T8B, vsrc); 1324 smov(dst, vtmp, B, 0); 1325 addw(dst, dst, isrc, ext::sxtb); 1326 break; 1327 case T_SHORT: 1328 addv(vtmp, isQ ? T8H : T4H, vsrc); 1329 smov(dst, vtmp, H, 0); 1330 addw(dst, dst, isrc, ext::sxth); 1331 break; 1332 case T_INT: 1333 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1334 umov(dst, vtmp, S, 0); 1335 addw(dst, dst, isrc); 1336 break; 1337 case T_LONG: 1338 assert(isQ, "unsupported"); 1339 addpd(vtmp, vsrc); 1340 umov(dst, vtmp, D, 0); 1341 add(dst, dst, isrc); 1342 break; 1343 default: 1344 assert(false, "unsupported"); 1345 ShouldNotReachHere(); 1346 } 1347 BLOCK_COMMENT("} neon_reduce_add_integral"); 1348 } 1349 1350 // Vector reduction multiply for integral type with ASIMD instructions. 1351 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1352 // Clobbers: rscratch1 1353 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1354 Register isrc, FloatRegister vsrc, 1355 unsigned vector_length_in_bytes, 1356 FloatRegister vtmp1, FloatRegister vtmp2) { 1357 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1358 bool isQ = vector_length_in_bytes == 16; 1359 1360 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1361 switch(bt) { 1362 case T_BYTE: 1363 if (isQ) { 1364 // Multiply the lower half and higher half of vector iteratively. 
1365 // vtmp1 = vsrc[8:15] 1366 ins(vtmp1, D, vsrc, 0, 1); 1367 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1368 mulv(vtmp1, T8B, vtmp1, vsrc); 1369 // vtmp2 = vtmp1[4:7] 1370 ins(vtmp2, S, vtmp1, 0, 1); 1371 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1372 mulv(vtmp1, T8B, vtmp2, vtmp1); 1373 } else { 1374 ins(vtmp1, S, vsrc, 0, 1); 1375 mulv(vtmp1, T8B, vtmp1, vsrc); 1376 } 1377 // vtmp2 = vtmp1[2:3] 1378 ins(vtmp2, H, vtmp1, 0, 1); 1379 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1380 mulv(vtmp2, T8B, vtmp2, vtmp1); 1381 // dst = vtmp2[0] * isrc * vtmp2[1] 1382 umov(rscratch1, vtmp2, B, 0); 1383 mulw(dst, rscratch1, isrc); 1384 sxtb(dst, dst); 1385 umov(rscratch1, vtmp2, B, 1); 1386 mulw(dst, rscratch1, dst); 1387 sxtb(dst, dst); 1388 break; 1389 case T_SHORT: 1390 if (isQ) { 1391 ins(vtmp2, D, vsrc, 0, 1); 1392 mulv(vtmp2, T4H, vtmp2, vsrc); 1393 ins(vtmp1, S, vtmp2, 0, 1); 1394 mulv(vtmp1, T4H, vtmp1, vtmp2); 1395 } else { 1396 ins(vtmp1, S, vsrc, 0, 1); 1397 mulv(vtmp1, T4H, vtmp1, vsrc); 1398 } 1399 umov(rscratch1, vtmp1, H, 0); 1400 mulw(dst, rscratch1, isrc); 1401 sxth(dst, dst); 1402 umov(rscratch1, vtmp1, H, 1); 1403 mulw(dst, rscratch1, dst); 1404 sxth(dst, dst); 1405 break; 1406 case T_INT: 1407 if (isQ) { 1408 ins(vtmp1, D, vsrc, 0, 1); 1409 mulv(vtmp1, T2S, vtmp1, vsrc); 1410 } else { 1411 vtmp1 = vsrc; 1412 } 1413 umov(rscratch1, vtmp1, S, 0); 1414 mul(dst, rscratch1, isrc); 1415 umov(rscratch1, vtmp1, S, 1); 1416 mul(dst, rscratch1, dst); 1417 break; 1418 case T_LONG: 1419 umov(rscratch1, vsrc, D, 0); 1420 mul(dst, isrc, rscratch1); 1421 umov(rscratch1, vsrc, D, 1); 1422 mul(dst, dst, rscratch1); 1423 break; 1424 default: 1425 assert(false, "unsupported"); 1426 ShouldNotReachHere(); 1427 } 1428 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1429 } 1430 1431 // Vector reduction multiply for floating-point type with ASIMD instructions. 1432 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1433 FloatRegister fsrc, FloatRegister vsrc, 1434 unsigned vector_length_in_bytes, 1435 FloatRegister vtmp) { 1436 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1437 bool isQ = vector_length_in_bytes == 16; 1438 1439 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1440 switch(bt) { 1441 case T_FLOAT: 1442 fmuls(dst, fsrc, vsrc); 1443 ins(vtmp, S, vsrc, 0, 1); 1444 fmuls(dst, dst, vtmp); 1445 if (isQ) { 1446 ins(vtmp, S, vsrc, 0, 2); 1447 fmuls(dst, dst, vtmp); 1448 ins(vtmp, S, vsrc, 0, 3); 1449 fmuls(dst, dst, vtmp); 1450 } 1451 break; 1452 case T_DOUBLE: 1453 assert(isQ, "unsupported"); 1454 fmuld(dst, fsrc, vsrc); 1455 ins(vtmp, D, vsrc, 0, 1); 1456 fmuld(dst, dst, vtmp); 1457 break; 1458 default: 1459 assert(false, "unsupported"); 1460 ShouldNotReachHere(); 1461 } 1462 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1463 } 1464 1465 // Helper to select logical instruction 1466 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1467 Register Rn, Register Rm, 1468 enum shift_kind kind, unsigned shift) { 1469 switch(opc) { 1470 case Op_AndReductionV: 1471 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1472 break; 1473 case Op_OrReductionV: 1474 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1475 break; 1476 case Op_XorReductionV: 1477 is64 ? 
eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1478 break; 1479 default: 1480 assert(false, "unsupported"); 1481 ShouldNotReachHere(); 1482 } 1483 } 1484 1485 // Vector reduction logical operations And, Or, Xor 1486 // Clobbers: rscratch1 1487 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1488 Register isrc, FloatRegister vsrc, 1489 unsigned vector_length_in_bytes) { 1490 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1491 "unsupported"); 1492 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1493 assert_different_registers(dst, isrc); 1494 bool isQ = vector_length_in_bytes == 16; 1495 1496 BLOCK_COMMENT("neon_reduce_logical {"); 1497 umov(rscratch1, vsrc, isQ ? D : S, 0); 1498 umov(dst, vsrc, isQ ? D : S, 1); 1499 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 1500 switch(bt) { 1501 case T_BYTE: 1502 if (isQ) { 1503 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1504 } 1505 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1506 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 1507 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1508 sxtb(dst, dst); 1509 break; 1510 case T_SHORT: 1511 if (isQ) { 1512 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1513 } 1514 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1515 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1516 sxth(dst, dst); 1517 break; 1518 case T_INT: 1519 if (isQ) { 1520 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1521 } 1522 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1523 break; 1524 case T_LONG: 1525 assert(isQ, "unsupported"); 1526 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 1527 break; 1528 default: 1529 assert(false, "unsupported"); 1530 ShouldNotReachHere(); 1531 } 1532 BLOCK_COMMENT("} neon_reduce_logical"); 1533 } 1534 1535 // Vector reduction min/max for integral type with ASIMD instructions. 1536 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 1537 // Clobbers: rscratch1, rflags 1538 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 1539 Register isrc, FloatRegister vsrc, 1540 unsigned vector_length_in_bytes, 1541 FloatRegister vtmp) { 1542 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 1543 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1544 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 1545 assert_different_registers(dst, isrc); 1546 bool isQ = vector_length_in_bytes == 16; 1547 bool is_min = opc == Op_MinReductionV; 1548 1549 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 1550 if (bt == T_LONG) { 1551 assert(vtmp == fnoreg, "should be"); 1552 assert(isQ, "should be"); 1553 umov(rscratch1, vsrc, D, 0); 1554 cmp(isrc, rscratch1); 1555 csel(dst, isrc, rscratch1, is_min ? LT : GT); 1556 umov(rscratch1, vsrc, D, 1); 1557 cmp(dst, rscratch1); 1558 csel(dst, dst, rscratch1, is_min ? LT : GT); 1559 } else { 1560 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1561 if (size == T2S) { 1562 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 1563 } else { 1564 is_min ? 
sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 1565 } 1566 if (bt == T_INT) { 1567 umov(dst, vtmp, S, 0); 1568 } else { 1569 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 1570 } 1571 cmpw(dst, isrc); 1572 cselw(dst, dst, isrc, is_min ? LT : GT); 1573 } 1574 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 1575 } 1576 1577 // Vector reduction for integral type with SVE instruction. 1578 // Supported operations are Add, And, Or, Xor, Max, Min. 1579 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 1580 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 1581 FloatRegister src2, PRegister pg, FloatRegister tmp) { 1582 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 1583 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1584 assert_different_registers(src1, dst); 1585 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 1586 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1587 switch (opc) { 1588 case Op_AddReductionVI: { 1589 sve_uaddv(tmp, size, pg, src2); 1590 if (bt == T_BYTE) { 1591 smov(dst, tmp, size, 0); 1592 addw(dst, src1, dst, ext::sxtb); 1593 } else if (bt == T_SHORT) { 1594 smov(dst, tmp, size, 0); 1595 addw(dst, src1, dst, ext::sxth); 1596 } else { 1597 umov(dst, tmp, size, 0); 1598 addw(dst, dst, src1); 1599 } 1600 break; 1601 } 1602 case Op_AddReductionVL: { 1603 sve_uaddv(tmp, size, pg, src2); 1604 umov(dst, tmp, size, 0); 1605 add(dst, dst, src1); 1606 break; 1607 } 1608 case Op_AndReductionV: { 1609 sve_andv(tmp, size, pg, src2); 1610 if (bt == T_INT || bt == T_LONG) { 1611 umov(dst, tmp, size, 0); 1612 } else { 1613 smov(dst, tmp, size, 0); 1614 } 1615 if (bt == T_LONG) { 1616 andr(dst, dst, src1); 1617 } else { 1618 andw(dst, dst, src1); 1619 } 1620 break; 1621 } 1622 case Op_OrReductionV: { 1623 sve_orv(tmp, size, pg, src2); 1624 if (bt == T_INT || bt == T_LONG) { 1625 umov(dst, tmp, size, 0); 1626 } else { 1627 smov(dst, tmp, size, 0); 1628 } 1629 if (bt == T_LONG) { 1630 orr(dst, dst, src1); 1631 } else { 1632 orrw(dst, dst, src1); 1633 } 1634 break; 1635 } 1636 case Op_XorReductionV: { 1637 sve_eorv(tmp, size, pg, src2); 1638 if (bt == T_INT || bt == T_LONG) { 1639 umov(dst, tmp, size, 0); 1640 } else { 1641 smov(dst, tmp, size, 0); 1642 } 1643 if (bt == T_LONG) { 1644 eor(dst, dst, src1); 1645 } else { 1646 eorw(dst, dst, src1); 1647 } 1648 break; 1649 } 1650 case Op_MaxReductionV: { 1651 sve_smaxv(tmp, size, pg, src2); 1652 if (bt == T_INT || bt == T_LONG) { 1653 umov(dst, tmp, size, 0); 1654 } else { 1655 smov(dst, tmp, size, 0); 1656 } 1657 if (bt == T_LONG) { 1658 cmp(dst, src1); 1659 csel(dst, dst, src1, Assembler::GT); 1660 } else { 1661 cmpw(dst, src1); 1662 cselw(dst, dst, src1, Assembler::GT); 1663 } 1664 break; 1665 } 1666 case Op_MinReductionV: { 1667 sve_sminv(tmp, size, pg, src2); 1668 if (bt == T_INT || bt == T_LONG) { 1669 umov(dst, tmp, size, 0); 1670 } else { 1671 smov(dst, tmp, size, 0); 1672 } 1673 if (bt == T_LONG) { 1674 cmp(dst, src1); 1675 csel(dst, dst, src1, Assembler::LT); 1676 } else { 1677 cmpw(dst, src1); 1678 cselw(dst, dst, src1, Assembler::LT); 1679 } 1680 break; 1681 } 1682 default: 1683 assert(false, "unsupported"); 1684 ShouldNotReachHere(); 1685 } 1686 1687 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 1688 if (bt == T_BYTE) { 1689 sxtb(dst, dst); 1690 } else if (bt == T_SHORT) { 1691 

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all elements to true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch (lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whilelow" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whilelow(dst, size, zr, rscratch1);
  }
}
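
// The pattern selection above can be illustrated with a worked example. Assuming a
// hypothetical 512-bit SVE implementation and bt == T_SHORT (so max_vector_length == 32),
// the encodings chosen would be:
//
//   lane_cnt == 0   ->  pfalse
//   lane_cnt == 7   ->  ptrue VL7
//   lane_cnt == 16  ->  ptrue VL16
//   lane_cnt == 30  ->  ptrue MUL3   (30 == 32 - 32 % 3)
//   lane_cnt == 32  ->  ptrue ALL
//   lane_cnt == 23  ->  whilelow with rscratch1 == 23 (no fixed or special pattern matches)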

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src  = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst  = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining elements with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat for the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift (cross-lane) the compressed high by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high (after the shift) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}
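
// A minimal scalar sketch of what the compress operation above produces (illustrative
// pseudocode only; "lanes" and the array indexing are not real identifiers in this file):
//
//   int k = 0;
//   for (int i = 0; i < lanes; i++) {
//     if (mask[i]) dst[k++] = src[i];    // pack active elements towards lane 0
//   }
//   for (; k < lanes; k++) dst[k] = 0;   // zero-fill the remaining lanes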

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src  = 88 77 66 55 44 33 22 11
  //                  mask = 01 00 00 01 01 00 01 01
  // Expected result: dst  = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining elements with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat for the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:  dst   = 00 00 00 00 00 44 22 11
  // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross-lane) the compressed high by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high (after the shift) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an SVE vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cmhs(tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
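
// Both round implementations above follow the same idea; roughly, and ignoring the exact
// lane-selection details, the per-lane behaviour matches this scalar sketch of
// java.lang.Math.round semantics (illustrative pseudocode, not the generated sequence;
// "LARGE" stands for the 0x1.0p23f / 0x1.0p52 constants loaded above):
//
//   // Math.round rounds halfway cases up, e.g. round(2.5) == 3 and round(-2.5) == -2.
//   if (x >= 0.0 || isnan(x) || -x >= LARGE) {   // beyond LARGE, x is already integral
//     result = convert(x, ties_to_away);         // fcvtas / frinta path
//   } else {
//     result = floor(x + 0.5);                   // fcvtms / frintm path for the
//   }                                            //   small-magnitude negative lanes
//   // The float-to-integer conversion saturates on overflow.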

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint);       // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != NULL) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != NULL && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}