1 /* 2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 36 #ifdef PRODUCT 37 #define BLOCK_COMMENT(str) /* nothing */ 38 #define STOP(error) stop(error) 39 #else 40 #define BLOCK_COMMENT(str) block_comment(str) 41 #define STOP(error) block_comment(error); stop(error) 42 #endif 43 44 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 45 46 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 47 48 // Search for str1 in str2 and return index or -1 49 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 50 void C2_MacroAssembler::string_indexof(Register str2, Register str1, 51 Register cnt2, Register cnt1, 52 Register tmp1, Register tmp2, 53 Register tmp3, Register tmp4, 54 Register tmp5, Register tmp6, 55 int icnt1, Register result, int ae) { 56 // NOTE: tmp5, tmp6 can be zr depending on specific method version 57 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 58 59 Register ch1 = rscratch1; 60 Register ch2 = rscratch2; 61 Register cnt1tmp = tmp1; 62 Register cnt2tmp = tmp2; 63 Register cnt1_neg = cnt1; 64 Register cnt2_neg = cnt2; 65 Register result_tmp = tmp4; 66 67 bool isL = ae == StrIntrinsicNode::LL; 68 69 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 70 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 71 int str1_chr_shift = str1_isL ? 0:1; 72 int str2_chr_shift = str2_isL ? 0:1; 73 int str1_chr_size = str1_isL ? 1:2; 74 int str2_chr_size = str2_isL ? 1:2; 75 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 76 (chr_insn)&MacroAssembler::ldrh; 77 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 78 (chr_insn)&MacroAssembler::ldrh; 79 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 80 chr_insn load_4chr = isL ? 
                                    (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer-Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half a register for
    // the UL case.
We'll re-read last character in inner pre-loop code to have 182 // single outer pre-loop load 183 const int firstStep = isL ? 7 : 3; 184 185 const int ASIZE = 256; 186 const int STORED_BYTES = 32; // amount of bytes stored per instruction 187 sub(sp, sp, ASIZE); 188 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 189 mov(ch1, sp); 190 BIND(BM_INIT_LOOP); 191 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 192 subs(tmp5, tmp5, 1); 193 br(GT, BM_INIT_LOOP); 194 195 sub(cnt1tmp, cnt1, 1); 196 mov(tmp5, str2); 197 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 198 sub(ch2, cnt1, 1); 199 mov(tmp3, str1); 200 BIND(BCLOOP); 201 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 202 if (!str1_isL) { 203 subs(zr, ch1, ASIZE); 204 br(HS, BCSKIP); 205 } 206 strb(ch2, Address(sp, ch1)); 207 BIND(BCSKIP); 208 subs(ch2, ch2, 1); 209 br(GT, BCLOOP); 210 211 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 212 if (str1_isL == str2_isL) { 213 // load last 8 bytes (8LL/4UU symbols) 214 ldr(tmp6, Address(tmp6, -wordSize)); 215 } else { 216 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 217 // convert Latin1 to UTF. We'll have to wait until load completed, but 218 // it's still faster than per-character loads+checks 219 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 220 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 221 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 222 andr(tmp6, tmp6, 0xFF); // str1[N-4] 223 orr(ch2, ch1, ch2, LSL, 16); 224 orr(tmp6, tmp6, tmp3, LSL, 48); 225 orr(tmp6, tmp6, ch2, LSL, 16); 226 } 227 BIND(BMLOOPSTR2); 228 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 229 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 230 if (str1_isL == str2_isL) { 231 // re-init tmp3. It's for free because it's executed in parallel with 232 // load above. Alternative is to initialize it before loop, but it'll 233 // affect performance on in-order systems with 2 or more ld/st pipelines 234 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 235 } 236 if (!isL) { // UU/UL case 237 lsl(ch2, cnt1tmp, 1); // offset in bytes 238 } 239 cmp(tmp3, skipch); 240 br(NE, BMSKIP); 241 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 242 mov(ch1, tmp6); 243 if (isL) { 244 b(BMLOOPSTR1_AFTER_LOAD); 245 } else { 246 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 247 b(BMLOOPSTR1_CMP); 248 } 249 BIND(BMLOOPSTR1); 250 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 251 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 252 BIND(BMLOOPSTR1_AFTER_LOAD); 253 subs(cnt1tmp, cnt1tmp, 1); 254 br(LT, BMLOOPSTR1_LASTCMP); 255 BIND(BMLOOPSTR1_CMP); 256 cmp(ch1, ch2); 257 br(EQ, BMLOOPSTR1); 258 BIND(BMSKIP); 259 if (!isL) { 260 // if we've met UTF symbol while searching Latin1 pattern, then we can 261 // skip cnt1 symbols 262 if (str1_isL != str2_isL) { 263 mov(result_tmp, cnt1); 264 } else { 265 mov(result_tmp, 1); 266 } 267 subs(zr, skipch, ASIZE); 268 br(HS, BMADV); 269 } 270 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 271 BIND(BMADV); 272 sub(cnt1tmp, cnt1, 1); 273 add(str2, str2, result_tmp, LSL, str2_chr_shift); 274 cmp(str2, str2end); 275 br(LE, BMLOOPSTR2); 276 add(sp, sp, ASIZE); 277 b(NOMATCH); 278 BIND(BMLOOPSTR1_LASTCMP); 279 cmp(ch1, ch2); 280 br(NE, BMSKIP); 281 BIND(BMMATCH); 282 sub(result, str2, tmp5); 283 if (!str2_isL) lsr(result, result, 1); 284 add(sp, sp, ASIZE); 285 b(DONE); 286 287 BIND(LINEARSTUB); 288 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 289 br(LT, LINEAR_MEDIUM); 290 mov(result, zr); 291 RuntimeAddress stub = nullptr; 292 if (isL) { 293 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 294 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 295 } else if (str1_isL) { 296 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 297 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 298 } else { 299 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 300 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 301 } 302 address call = trampoline_call(stub); 303 if (call == nullptr) { 304 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 305 ciEnv::current()->record_failure("CodeCache is full"); 306 return; 307 } 308 b(DONE); 309 } 310 311 BIND(LINEARSEARCH); 312 { 313 Label DO1, DO2, DO3; 314 315 Register str2tmp = tmp2; 316 Register first = tmp3; 317 318 if (icnt1 == -1) 319 { 320 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 321 322 cmp(cnt1, u1(str1_isL == str2_isL ? 
4 : 2)); 323 br(LT, DOSHORT); 324 BIND(LINEAR_MEDIUM); 325 (this->*str1_load_1chr)(first, Address(str1)); 326 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 327 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 328 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 329 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 330 331 BIND(FIRST_LOOP); 332 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 333 cmp(first, ch2); 334 br(EQ, STR1_LOOP); 335 BIND(STR2_NEXT); 336 adds(cnt2_neg, cnt2_neg, str2_chr_size); 337 br(LE, FIRST_LOOP); 338 b(NOMATCH); 339 340 BIND(STR1_LOOP); 341 adds(cnt1tmp, cnt1_neg, str1_chr_size); 342 add(cnt2tmp, cnt2_neg, str2_chr_size); 343 br(GE, MATCH); 344 345 BIND(STR1_NEXT); 346 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 347 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 348 cmp(ch1, ch2); 349 br(NE, STR2_NEXT); 350 adds(cnt1tmp, cnt1tmp, str1_chr_size); 351 add(cnt2tmp, cnt2tmp, str2_chr_size); 352 br(LT, STR1_NEXT); 353 b(MATCH); 354 355 BIND(DOSHORT); 356 if (str1_isL == str2_isL) { 357 cmp(cnt1, (u1)2); 358 br(LT, DO1); 359 br(GT, DO3); 360 } 361 } 362 363 if (icnt1 == 4) { 364 Label CH1_LOOP; 365 366 (this->*load_4chr)(ch1, str1); 367 sub(result_tmp, cnt2, 4); 368 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 369 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 370 371 BIND(CH1_LOOP); 372 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 373 cmp(ch1, ch2); 374 br(EQ, MATCH); 375 adds(cnt2_neg, cnt2_neg, str2_chr_size); 376 br(LE, CH1_LOOP); 377 b(NOMATCH); 378 } 379 380 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 381 Label CH1_LOOP; 382 383 BIND(DO2); 384 (this->*load_2chr)(ch1, str1); 385 if (icnt1 == 2) { 386 sub(result_tmp, cnt2, 2); 387 } 388 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 389 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 390 BIND(CH1_LOOP); 391 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 392 cmp(ch1, ch2); 393 br(EQ, MATCH); 394 adds(cnt2_neg, cnt2_neg, str2_chr_size); 395 br(LE, CH1_LOOP); 396 b(NOMATCH); 397 } 398 399 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 400 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 401 402 BIND(DO3); 403 (this->*load_2chr)(first, str1); 404 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 405 if (icnt1 == 3) { 406 sub(result_tmp, cnt2, 3); 407 } 408 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 409 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 410 BIND(FIRST_LOOP); 411 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 412 cmpw(first, ch2); 413 br(EQ, STR1_LOOP); 414 BIND(STR2_NEXT); 415 adds(cnt2_neg, cnt2_neg, str2_chr_size); 416 br(LE, FIRST_LOOP); 417 b(NOMATCH); 418 419 BIND(STR1_LOOP); 420 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 421 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 422 cmp(ch1, ch2); 423 br(NE, STR2_NEXT); 424 b(MATCH); 425 } 426 427 if (icnt1 == -1 || icnt1 == 1) { 428 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 429 430 BIND(DO1); 431 (this->*str1_load_1chr)(ch1, str1); 432 cmp(cnt2, (u1)8); 433 br(LT, DO1_SHORT); 434 435 sub(result_tmp, cnt2, 8/str2_chr_size); 436 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 437 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 438 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 439 440 if (str2_isL) { 441 orr(ch1, ch1, ch1, LSL, 8); 442 } 443 orr(ch1, ch1, ch1, LSL, 16); 444 orr(ch1, ch1, ch1, LSL, 32); 445 BIND(CH1_LOOP); 446 ldr(ch2, Address(str2, cnt2_neg)); 447 eor(ch2, ch1, ch2); 448 sub(tmp1, ch2, tmp3); 449 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 450 bics(tmp1, tmp1, tmp2); 451 br(NE, HAS_ZERO); 452 adds(cnt2_neg, cnt2_neg, 8); 453 br(LT, CH1_LOOP); 454 455 cmp(cnt2_neg, (u1)8); 456 mov(cnt2_neg, 0); 457 br(LT, CH1_LOOP); 458 b(NOMATCH); 459 460 BIND(HAS_ZERO); 461 rev(tmp1, tmp1); 462 clz(tmp1, tmp1); 463 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 464 b(MATCH); 465 466 BIND(DO1_SHORT); 467 mov(result_tmp, cnt2); 468 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 469 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 470 BIND(DO1_LOOP); 471 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 472 cmpw(ch1, ch2); 473 br(EQ, MATCH); 474 adds(cnt2_neg, cnt2_neg, str2_chr_size); 475 br(LT, DO1_LOOP); 476 } 477 } 478 BIND(NOMATCH); 479 mov(result, -1); 480 b(DONE); 481 BIND(MATCH); 482 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 483 BIND(DONE); 484 } 485 486 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 487 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 488 489 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 490 Register ch, Register result, 491 Register tmp1, Register tmp2, Register tmp3) 492 { 493 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 494 Register cnt1_neg = cnt1; 495 Register ch1 = rscratch1; 496 Register result_tmp = rscratch2; 497 498 cbz(cnt1, NOMATCH); 499 500 cmp(cnt1, (u1)4); 501 br(LT, DO1_SHORT); 502 503 orr(ch, ch, ch, LSL, 16); 504 orr(ch, ch, ch, LSL, 32); 505 506 sub(cnt1, cnt1, 4); 507 mov(result_tmp, cnt1); 508 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 509 sub(cnt1_neg, zr, cnt1, LSL, 1); 510 511 mov(tmp3, 0x0001000100010001); 512 513 BIND(CH1_LOOP); 514 ldr(ch1, Address(str1, cnt1_neg)); 515 eor(ch1, ch, ch1); 516 sub(tmp1, ch1, tmp3); 517 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 518 bics(tmp1, tmp1, tmp2); 519 br(NE, HAS_ZERO); 520 adds(cnt1_neg, cnt1_neg, 8); 521 br(LT, CH1_LOOP); 522 523 cmp(cnt1_neg, (u1)8); 524 mov(cnt1_neg, 0); 525 br(LT, CH1_LOOP); 526 b(NOMATCH); 527 528 BIND(HAS_ZERO); 529 rev(tmp1, tmp1); 530 clz(tmp1, tmp1); 531 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 532 b(MATCH); 533 534 BIND(DO1_SHORT); 535 mov(result_tmp, cnt1); 536 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 537 sub(cnt1_neg, zr, cnt1, LSL, 1); 538 BIND(DO1_LOOP); 539 ldrh(ch1, Address(str1, cnt1_neg)); 540 cmpw(ch, ch1); 541 br(EQ, MATCH); 542 adds(cnt1_neg, cnt1_neg, 2); 543 br(LT, DO1_LOOP); 544 BIND(NOMATCH); 545 mov(result, -1); 546 b(DONE); 547 BIND(MATCH); 548 add(result, result_tmp, cnt1_neg, ASR, 1); 549 BIND(DONE); 550 } 551 552 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 553 Register ch, Register result, 554 FloatRegister ztmp1, 555 FloatRegister ztmp2, 556 PRegister tmp_pg, 557 PRegister tmp_pdn, bool isL) 558 { 559 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 560 assert(tmp_pg->is_governing(), 561 "this register has to be a governing predicate register"); 562 563 Label LOOP, MATCH, DONE, NOMATCH; 564 Register vec_len = rscratch1; 565 Register idx = rscratch2; 566 567 SIMD_RegVariant T = (isL == true) ? 
B : H; 568 569 cbz(cnt1, NOMATCH); 570 571 // Assign the particular char throughout the vector. 572 sve_dup(ztmp2, T, ch); 573 if (isL) { 574 sve_cntb(vec_len); 575 } else { 576 sve_cnth(vec_len); 577 } 578 mov(idx, 0); 579 580 // Generate a predicate to control the reading of input string. 581 sve_whilelt(tmp_pg, T, idx, cnt1); 582 583 BIND(LOOP); 584 // Read a vector of 8- or 16-bit data depending on the string type. Note 585 // that inactive elements indicated by the predicate register won't cause 586 // a data read from memory to the destination vector. 587 if (isL) { 588 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 589 } else { 590 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 591 } 592 add(idx, idx, vec_len); 593 594 // Perform the comparison. An element of the destination predicate is set 595 // to active if the particular char is matched. 596 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 597 598 // Branch if the particular char is found. 599 br(NE, MATCH); 600 601 sve_whilelt(tmp_pg, T, idx, cnt1); 602 603 // Loop back if the particular char not found. 604 br(MI, LOOP); 605 606 BIND(NOMATCH); 607 mov(result, -1); 608 b(DONE); 609 610 BIND(MATCH); 611 // Undo the index increment. 612 sub(idx, idx, vec_len); 613 614 // Crop the vector to find its location. 615 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 616 add(result, idx, -1); 617 sve_incp(result, T, tmp_pdn); 618 BIND(DONE); 619 } 620 621 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 622 Register ch, Register result, 623 Register tmp1, Register tmp2, Register tmp3) 624 { 625 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 626 Register cnt1_neg = cnt1; 627 Register ch1 = rscratch1; 628 Register result_tmp = rscratch2; 629 630 cbz(cnt1, NOMATCH); 631 632 cmp(cnt1, (u1)8); 633 br(LT, DO1_SHORT); 634 635 orr(ch, ch, ch, LSL, 8); 636 orr(ch, ch, ch, LSL, 16); 637 orr(ch, ch, ch, LSL, 32); 638 639 sub(cnt1, cnt1, 8); 640 mov(result_tmp, cnt1); 641 lea(str1, Address(str1, cnt1)); 642 sub(cnt1_neg, zr, cnt1); 643 644 mov(tmp3, 0x0101010101010101); 645 646 BIND(CH1_LOOP); 647 ldr(ch1, Address(str1, cnt1_neg)); 648 eor(ch1, ch, ch1); 649 sub(tmp1, ch1, tmp3); 650 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 651 bics(tmp1, tmp1, tmp2); 652 br(NE, HAS_ZERO); 653 adds(cnt1_neg, cnt1_neg, 8); 654 br(LT, CH1_LOOP); 655 656 cmp(cnt1_neg, (u1)8); 657 mov(cnt1_neg, 0); 658 br(LT, CH1_LOOP); 659 b(NOMATCH); 660 661 BIND(HAS_ZERO); 662 rev(tmp1, tmp1); 663 clz(tmp1, tmp1); 664 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 665 b(MATCH); 666 667 BIND(DO1_SHORT); 668 mov(result_tmp, cnt1); 669 lea(str1, Address(str1, cnt1)); 670 sub(cnt1_neg, zr, cnt1); 671 BIND(DO1_LOOP); 672 ldrb(ch1, Address(str1, cnt1_neg)); 673 cmp(ch, ch1); 674 br(EQ, MATCH); 675 adds(cnt1_neg, cnt1_neg, 1); 676 br(LT, DO1_LOOP); 677 BIND(NOMATCH); 678 mov(result, -1); 679 b(DONE); 680 BIND(MATCH); 681 add(result, result_tmp, cnt1_neg); 682 BIND(DONE); 683 } 684 685 // Compare strings. 
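// As a rough illustrative sketch (plain C, not the generated code, and
// ignoring the Latin1/UTF-16 encoding combinations), the contract implemented
// below is:
//
//    int string_compare(int* a, int alen, int* b, int blen) {
//       int n = alen < blen ? alen : blen;
//       for (int i = 0; i < n; i++) {
//          if (a[i] != b[i]) return a[i] - b[i];  // first differing character
//       }
//       return alen - blen;                       // shorter string compares lower
//    }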
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
                                       FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                       PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
728 subsw(result, cnt1, cnt2); 729 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 730 731 // A very short string 732 cmpw(cnt2, minCharsInWord); 733 br(Assembler::LE, SHORT_STRING); 734 735 // Compare longwords 736 // load first parts of strings and finish initialization while loading 737 { 738 if (str1_isL == str2_isL) { // LL or UU 739 ldr(tmp1, Address(str1)); 740 cmp(str1, str2); 741 br(Assembler::EQ, DONE); 742 ldr(tmp2, Address(str2)); 743 cmp(cnt2, stub_threshold); 744 br(GE, STUB); 745 subsw(cnt2, cnt2, minCharsInWord); 746 br(EQ, TAIL_CHECK); 747 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 748 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 749 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 750 } else if (isLU) { 751 ldrs(vtmp, Address(str1)); 752 ldr(tmp2, Address(str2)); 753 cmp(cnt2, stub_threshold); 754 br(GE, STUB); 755 subw(cnt2, cnt2, 4); 756 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 757 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 758 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 759 zip1(vtmp, T8B, vtmp, vtmpZ); 760 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 761 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 762 add(cnt1, cnt1, 4); 763 fmovd(tmp1, vtmp); 764 } else { // UL case 765 ldr(tmp1, Address(str1)); 766 ldrs(vtmp, Address(str2)); 767 cmp(cnt2, stub_threshold); 768 br(GE, STUB); 769 subw(cnt2, cnt2, 4); 770 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 771 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 772 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 773 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 774 zip1(vtmp, T8B, vtmp, vtmpZ); 775 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 776 add(cnt1, cnt1, 8); 777 fmovd(tmp2, vtmp); 778 } 779 adds(cnt2, cnt2, isUL ? 4 : 8); 780 br(GE, TAIL); 781 eor(rscratch2, tmp1, tmp2); 782 cbnz(rscratch2, DIFF); 783 // main loop 784 bind(NEXT_WORD); 785 if (str1_isL == str2_isL) { 786 ldr(tmp1, Address(str1, cnt2)); 787 ldr(tmp2, Address(str2, cnt2)); 788 adds(cnt2, cnt2, 8); 789 } else if (isLU) { 790 ldrs(vtmp, Address(str1, cnt1)); 791 ldr(tmp2, Address(str2, cnt2)); 792 add(cnt1, cnt1, 4); 793 zip1(vtmp, T8B, vtmp, vtmpZ); 794 fmovd(tmp1, vtmp); 795 adds(cnt2, cnt2, 8); 796 } else { // UL 797 ldrs(vtmp, Address(str2, cnt2)); 798 ldr(tmp1, Address(str1, cnt1)); 799 zip1(vtmp, T8B, vtmp, vtmpZ); 800 add(cnt1, cnt1, 8); 801 fmovd(tmp2, vtmp); 802 adds(cnt2, cnt2, 4); 803 } 804 br(GE, TAIL); 805 806 eor(rscratch2, tmp1, tmp2); 807 cbz(rscratch2, NEXT_WORD); 808 b(DIFF); 809 bind(TAIL); 810 eor(rscratch2, tmp1, tmp2); 811 cbnz(rscratch2, DIFF); 812 // Last longword. In the case where length == 4 we compare the 813 // same longword twice, but that's still faster than another 814 // conditional branch. 815 if (str1_isL == str2_isL) { 816 ldr(tmp1, Address(str1)); 817 ldr(tmp2, Address(str2)); 818 } else if (isLU) { 819 ldrs(vtmp, Address(str1)); 820 ldr(tmp2, Address(str2)); 821 zip1(vtmp, T8B, vtmp, vtmpZ); 822 fmovd(tmp1, vtmp); 823 } else { // UL 824 ldrs(vtmp, Address(str2)); 825 ldr(tmp1, Address(str1)); 826 zip1(vtmp, T8B, vtmp, vtmpZ); 827 fmovd(tmp2, vtmp); 828 } 829 bind(TAIL_CHECK); 830 eor(rscratch2, tmp1, tmp2); 831 cbz(rscratch2, DONE); 832 833 // Find the first different characters in the longwords and 834 // compute their difference. 835 bind(DIFF); 836 rev(rscratch2, rscratch2); 837 clz(rscratch2, rscratch2); 838 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 839 lsrv(tmp1, tmp1, rscratch2); 840 (this->*ext_chr)(tmp1, tmp1); 841 lsrv(tmp2, tmp2, rscratch2); 842 (this->*ext_chr)(tmp2, tmp2); 843 subw(result, tmp1, tmp2); 844 b(DONE); 845 } 846 847 bind(STUB); 848 RuntimeAddress stub = nullptr; 849 switch(ae) { 850 case StrIntrinsicNode::LL: 851 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 852 break; 853 case StrIntrinsicNode::UU: 854 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 855 break; 856 case StrIntrinsicNode::LU: 857 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 858 break; 859 case StrIntrinsicNode::UL: 860 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 861 break; 862 default: 863 ShouldNotReachHere(); 864 } 865 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 866 address call = trampoline_call(stub); 867 if (call == nullptr) { 868 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 869 ciEnv::current()->record_failure("CodeCache is full"); 870 return; 871 } 872 b(DONE); 873 874 bind(SHORT_STRING); 875 // Is the minimum length zero? 876 cbz(cnt2, DONE); 877 // arrange code to do most branches while loading and loading next characters 878 // while comparing previous 879 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 880 subs(cnt2, cnt2, 1); 881 br(EQ, SHORT_LAST_INIT); 882 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 883 b(SHORT_LOOP_START); 884 bind(SHORT_LOOP); 885 subs(cnt2, cnt2, 1); 886 br(EQ, SHORT_LAST); 887 bind(SHORT_LOOP_START); 888 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 889 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 890 cmp(tmp1, cnt1); 891 br(NE, SHORT_LOOP_TAIL); 892 subs(cnt2, cnt2, 1); 893 br(EQ, SHORT_LAST2); 894 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 895 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 896 cmp(tmp2, rscratch1); 897 br(EQ, SHORT_LOOP); 898 sub(result, tmp2, rscratch1); 899 b(DONE); 900 bind(SHORT_LOOP_TAIL); 901 sub(result, tmp1, cnt1); 902 b(DONE); 903 bind(SHORT_LAST2); 904 cmp(tmp2, rscratch1); 905 br(EQ, DONE); 906 sub(result, tmp2, rscratch1); 907 908 b(DONE); 909 bind(SHORT_LAST_INIT); 910 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 911 bind(SHORT_LAST); 912 cmp(tmp1, cnt1); 913 br(EQ, DONE); 914 sub(result, tmp1, cnt1); 915 916 bind(DONE); 917 918 BLOCK_COMMENT("} string_compare"); 919 } 920 921 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 922 FloatRegister src2, Condition cond, bool isQ) { 923 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 924 FloatRegister zn = src1, zm = src2; 925 bool needs_negation = false; 926 switch (cond) { 927 case LT: cond = GT; zn = src2; zm = src1; break; 928 case LE: cond = GE; zn = src2; zm = src1; break; 929 case LO: cond = HI; zn = src2; zm = src1; break; 930 case LS: cond = HS; zn = src2; zm = src1; break; 931 case NE: cond = EQ; needs_negation = true; break; 932 default: 933 break; 934 } 935 936 if (is_floating_point_type(bt)) { 937 fcm(cond, dst, size, zn, zm); 938 } else { 939 cm(cond, dst, size, zn, zm); 940 } 941 942 if (needs_negation) { 943 notr(dst, isQ ? 
T16B : T8B, dst); 944 } 945 } 946 947 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 948 Condition cond, bool isQ) { 949 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 950 if (bt == T_FLOAT || bt == T_DOUBLE) { 951 if (cond == Assembler::NE) { 952 fcm(Assembler::EQ, dst, size, src); 953 notr(dst, isQ ? T16B : T8B, dst); 954 } else { 955 fcm(cond, dst, size, src); 956 } 957 } else { 958 if (cond == Assembler::NE) { 959 cm(Assembler::EQ, dst, size, src); 960 notr(dst, isQ ? T16B : T8B, dst); 961 } else { 962 cm(cond, dst, size, src); 963 } 964 } 965 } 966 967 // Compress the least significant bit of each byte to the rightmost and clear 968 // the higher garbage bits. 969 void C2_MacroAssembler::bytemask_compress(Register dst) { 970 // Example input, dst = 0x01 00 00 00 01 01 00 01 971 // The "??" bytes are garbage. 972 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 973 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 974 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 975 andr(dst, dst, 0xff); // dst = 0x8D 976 } 977 978 // Pack the lowest-numbered bit of each mask element in src into a long value 979 // in dst, at most the first 64 lane elements. 980 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 981 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 982 FloatRegister vtmp1, FloatRegister vtmp2) { 983 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 984 assert_different_registers(dst, rscratch1); 985 assert_different_registers(vtmp1, vtmp2); 986 987 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 988 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 989 // Expected: dst = 0x658D 990 991 // Convert the mask into vector with sequential bytes. 992 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 993 sve_cpy(vtmp1, size, src, 1, false); 994 if (bt != T_BYTE) { 995 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 996 } 997 998 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 999 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1000 // is to compress each significant bit of the byte in a cross-lane way. Due 1001 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1002 // (bit-compress in each lane) with the biggest lane size (T = D) then 1003 // concatenate the results. 1004 1005 // The second source input of BEXT, initialized with 0x01 in each byte. 1006 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1007 sve_dup(vtmp2, B, 1); 1008 1009 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1010 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1011 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1012 // --------------------------------------- 1013 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1014 sve_bext(vtmp1, D, vtmp1, vtmp2); 1015 1016 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1017 // result to dst. 1018 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1019 // dst = 0x658D 1020 if (lane_cnt <= 8) { 1021 // No need to concatenate. 1022 umov(dst, vtmp1, B, 0); 1023 } else if (lane_cnt <= 16) { 1024 ins(vtmp1, B, vtmp1, 1, 8); 1025 umov(dst, vtmp1, H, 0); 1026 } else { 1027 // As the lane count is 64 at most, the final expected value must be in 1028 // the lowest 64 bits after narrowing vtmp1 from D to B. 
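      // For example (values assumed for illustration), with lane_cnt == 32:
      // after BEXT each D lane holds one compressed mask byte in its low
      // 8 bits; narrowing D -> B packs those bytes next to each other, so the
      // 32 mask bits end up in the low half of lane 0 and can be moved to dst
      // with a single umov.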
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register in different types, with
// a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D, which
// has 24 significant bits, would be an invalid input if the dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:  src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01100101 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask value with a minimum granularity of one byte, we have
  // to transform the value in the first lane, which is currently a mask in bits, into a
  // mask in bytes. This can be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte into every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  // ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate the mask according to the given vector, in which the elements have been
  // extended to the expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}

// Clobbers: rflags
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt) {
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
      sxtl(dst, T8H, src, T8B);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      sxtl(dst, T8H, src, T8B);
      sxtl(dst, T4S, dst, T4H);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    sxtl(dst, T4S, src, T4H);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    sxtl(dst, T2D, src, T2S);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g.
4I -> 4B 1174 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1175 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1176 if (src_bt == T_SHORT) { 1177 // 4S/8S to 4B/8B 1178 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1179 assert(dst_bt == T_BYTE, "unsupported"); 1180 xtn(dst, T8B, src, T8H); 1181 } else if (src_bt == T_INT) { 1182 // 4I to 4B/4S 1183 assert(src_vlen_in_bytes == 16, "unsupported"); 1184 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1185 xtn(dst, T4H, src, T4S); 1186 if (dst_bt == T_BYTE) { 1187 xtn(dst, T8B, dst, T8H); 1188 } 1189 } else if (src_bt == T_LONG) { 1190 // 2L to 2I 1191 assert(src_vlen_in_bytes == 16, "unsupported"); 1192 assert(dst_bt == T_INT, "unsupported"); 1193 xtn(dst, T2S, src, T2D); 1194 } else { 1195 ShouldNotReachHere(); 1196 } 1197 } 1198 1199 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1200 FloatRegister src, SIMD_RegVariant src_size) { 1201 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1202 if (src_size == B) { 1203 switch (dst_size) { 1204 case H: 1205 sve_sunpklo(dst, H, src); 1206 break; 1207 case S: 1208 sve_sunpklo(dst, H, src); 1209 sve_sunpklo(dst, S, dst); 1210 break; 1211 case D: 1212 sve_sunpklo(dst, H, src); 1213 sve_sunpklo(dst, S, dst); 1214 sve_sunpklo(dst, D, dst); 1215 break; 1216 default: 1217 ShouldNotReachHere(); 1218 } 1219 } else if (src_size == H) { 1220 if (dst_size == S) { 1221 sve_sunpklo(dst, S, src); 1222 } else { // D 1223 sve_sunpklo(dst, S, src); 1224 sve_sunpklo(dst, D, dst); 1225 } 1226 } else if (src_size == S) { 1227 sve_sunpklo(dst, D, src); 1228 } 1229 } 1230 1231 // Vector narrow from src to dst with specified element sizes. 1232 // High part of dst vector will be filled with zero. 1233 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1234 FloatRegister src, SIMD_RegVariant src_size, 1235 FloatRegister tmp) { 1236 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1237 assert_different_registers(src, tmp); 1238 sve_dup(tmp, src_size, 0); 1239 if (src_size == D) { 1240 switch (dst_size) { 1241 case S: 1242 sve_uzp1(dst, S, src, tmp); 1243 break; 1244 case H: 1245 assert_different_registers(dst, tmp); 1246 sve_uzp1(dst, S, src, tmp); 1247 sve_uzp1(dst, H, dst, tmp); 1248 break; 1249 case B: 1250 assert_different_registers(dst, tmp); 1251 sve_uzp1(dst, S, src, tmp); 1252 sve_uzp1(dst, H, dst, tmp); 1253 sve_uzp1(dst, B, dst, tmp); 1254 break; 1255 default: 1256 ShouldNotReachHere(); 1257 } 1258 } else if (src_size == S) { 1259 if (dst_size == H) { 1260 sve_uzp1(dst, H, src, tmp); 1261 } else { // B 1262 assert_different_registers(dst, tmp); 1263 sve_uzp1(dst, H, src, tmp); 1264 sve_uzp1(dst, B, dst, tmp); 1265 } 1266 } else if (src_size == H) { 1267 sve_uzp1(dst, B, src, tmp); 1268 } 1269 } 1270 1271 // Extend src predicate to dst predicate with the same lane count but larger 1272 // element size, e.g. 
64Byte -> 512Long 1273 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1274 uint dst_element_length_in_bytes, 1275 uint src_element_length_in_bytes) { 1276 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1277 sve_punpklo(dst, src); 1278 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1279 sve_punpklo(dst, src); 1280 sve_punpklo(dst, dst); 1281 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1282 sve_punpklo(dst, src); 1283 sve_punpklo(dst, dst); 1284 sve_punpklo(dst, dst); 1285 } else { 1286 assert(false, "unsupported"); 1287 ShouldNotReachHere(); 1288 } 1289 } 1290 1291 // Narrow src predicate to dst predicate with the same lane count but 1292 // smaller element size, e.g. 512Long -> 64Byte 1293 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1294 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1295 // The insignificant bits in src predicate are expected to be zero. 1296 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1297 // passed as the second argument. An example narrowing operation with a given mask would be - 1298 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1299 // Mask (for 2 Longs) : TF 1300 // Predicate register for the above mask (16 bits) : 00000001 00000000 1301 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1302 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1303 assert_different_registers(src, ptmp); 1304 assert_different_registers(dst, ptmp); 1305 sve_pfalse(ptmp); 1306 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1307 sve_uzp1(dst, B, src, ptmp); 1308 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1309 sve_uzp1(dst, H, src, ptmp); 1310 sve_uzp1(dst, B, dst, ptmp); 1311 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1312 sve_uzp1(dst, S, src, ptmp); 1313 sve_uzp1(dst, H, dst, ptmp); 1314 sve_uzp1(dst, B, dst, ptmp); 1315 } else { 1316 assert(false, "unsupported"); 1317 ShouldNotReachHere(); 1318 } 1319 } 1320 1321 // Vector reduction add for integral type with ASIMD instructions. 1322 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1323 Register isrc, FloatRegister vsrc, 1324 unsigned vector_length_in_bytes, 1325 FloatRegister vtmp) { 1326 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1327 assert_different_registers(dst, isrc); 1328 bool isQ = vector_length_in_bytes == 16; 1329 1330 BLOCK_COMMENT("neon_reduce_add_integral {"); 1331 switch(bt) { 1332 case T_BYTE: 1333 addv(vtmp, isQ ? T16B : T8B, vsrc); 1334 smov(dst, vtmp, B, 0); 1335 addw(dst, dst, isrc, ext::sxtb); 1336 break; 1337 case T_SHORT: 1338 addv(vtmp, isQ ? T8H : T4H, vsrc); 1339 smov(dst, vtmp, H, 0); 1340 addw(dst, dst, isrc, ext::sxth); 1341 break; 1342 case T_INT: 1343 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1344 umov(dst, vtmp, S, 0); 1345 addw(dst, dst, isrc); 1346 break; 1347 case T_LONG: 1348 assert(isQ, "unsupported"); 1349 addpd(vtmp, vsrc); 1350 umov(dst, vtmp, D, 0); 1351 add(dst, dst, isrc); 1352 break; 1353 default: 1354 assert(false, "unsupported"); 1355 ShouldNotReachHere(); 1356 } 1357 BLOCK_COMMENT("} neon_reduce_add_integral"); 1358 } 1359 1360 // Vector reduction multiply for integral type with ASIMD instructions. 
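// The reduction below repeatedly multiplies the upper half of the vector into
// the lower half until two lanes remain, then finishes in general registers.
// Conceptually (illustrative C, not the generated code), for a 16-byte vector:
//
//    for (int w = 8; w >= 2; w /= 2)
//       for (int i = 0; i < w; i++) v[i] *= v[i + w];
//    dst = (int8_t)((int8_t)(v[0] * isrc) * v[1]);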
1361 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1362 // Clobbers: rscratch1 1363 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1364 Register isrc, FloatRegister vsrc, 1365 unsigned vector_length_in_bytes, 1366 FloatRegister vtmp1, FloatRegister vtmp2) { 1367 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1368 bool isQ = vector_length_in_bytes == 16; 1369 1370 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1371 switch(bt) { 1372 case T_BYTE: 1373 if (isQ) { 1374 // Multiply the lower half and higher half of vector iteratively. 1375 // vtmp1 = vsrc[8:15] 1376 ins(vtmp1, D, vsrc, 0, 1); 1377 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1378 mulv(vtmp1, T8B, vtmp1, vsrc); 1379 // vtmp2 = vtmp1[4:7] 1380 ins(vtmp2, S, vtmp1, 0, 1); 1381 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1382 mulv(vtmp1, T8B, vtmp2, vtmp1); 1383 } else { 1384 ins(vtmp1, S, vsrc, 0, 1); 1385 mulv(vtmp1, T8B, vtmp1, vsrc); 1386 } 1387 // vtmp2 = vtmp1[2:3] 1388 ins(vtmp2, H, vtmp1, 0, 1); 1389 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1390 mulv(vtmp2, T8B, vtmp2, vtmp1); 1391 // dst = vtmp2[0] * isrc * vtmp2[1] 1392 umov(rscratch1, vtmp2, B, 0); 1393 mulw(dst, rscratch1, isrc); 1394 sxtb(dst, dst); 1395 umov(rscratch1, vtmp2, B, 1); 1396 mulw(dst, rscratch1, dst); 1397 sxtb(dst, dst); 1398 break; 1399 case T_SHORT: 1400 if (isQ) { 1401 ins(vtmp2, D, vsrc, 0, 1); 1402 mulv(vtmp2, T4H, vtmp2, vsrc); 1403 ins(vtmp1, S, vtmp2, 0, 1); 1404 mulv(vtmp1, T4H, vtmp1, vtmp2); 1405 } else { 1406 ins(vtmp1, S, vsrc, 0, 1); 1407 mulv(vtmp1, T4H, vtmp1, vsrc); 1408 } 1409 umov(rscratch1, vtmp1, H, 0); 1410 mulw(dst, rscratch1, isrc); 1411 sxth(dst, dst); 1412 umov(rscratch1, vtmp1, H, 1); 1413 mulw(dst, rscratch1, dst); 1414 sxth(dst, dst); 1415 break; 1416 case T_INT: 1417 if (isQ) { 1418 ins(vtmp1, D, vsrc, 0, 1); 1419 mulv(vtmp1, T2S, vtmp1, vsrc); 1420 } else { 1421 vtmp1 = vsrc; 1422 } 1423 umov(rscratch1, vtmp1, S, 0); 1424 mul(dst, rscratch1, isrc); 1425 umov(rscratch1, vtmp1, S, 1); 1426 mul(dst, rscratch1, dst); 1427 break; 1428 case T_LONG: 1429 umov(rscratch1, vsrc, D, 0); 1430 mul(dst, isrc, rscratch1); 1431 umov(rscratch1, vsrc, D, 1); 1432 mul(dst, dst, rscratch1); 1433 break; 1434 default: 1435 assert(false, "unsupported"); 1436 ShouldNotReachHere(); 1437 } 1438 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1439 } 1440 1441 // Vector reduction multiply for floating-point type with ASIMD instructions. 
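// The lane products are accumulated strictly in lane order, i.e. for a
// 128-bit float vector the result is effectively
//    dst = (((fsrc * v[0]) * v[1]) * v[2]) * v[3]
// which preserves the rounding behaviour of a sequential reduction.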
1442 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1443 FloatRegister fsrc, FloatRegister vsrc, 1444 unsigned vector_length_in_bytes, 1445 FloatRegister vtmp) { 1446 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1447 bool isQ = vector_length_in_bytes == 16; 1448 1449 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1450 switch(bt) { 1451 case T_FLOAT: 1452 fmuls(dst, fsrc, vsrc); 1453 ins(vtmp, S, vsrc, 0, 1); 1454 fmuls(dst, dst, vtmp); 1455 if (isQ) { 1456 ins(vtmp, S, vsrc, 0, 2); 1457 fmuls(dst, dst, vtmp); 1458 ins(vtmp, S, vsrc, 0, 3); 1459 fmuls(dst, dst, vtmp); 1460 } 1461 break; 1462 case T_DOUBLE: 1463 assert(isQ, "unsupported"); 1464 fmuld(dst, fsrc, vsrc); 1465 ins(vtmp, D, vsrc, 0, 1); 1466 fmuld(dst, dst, vtmp); 1467 break; 1468 default: 1469 assert(false, "unsupported"); 1470 ShouldNotReachHere(); 1471 } 1472 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1473 } 1474 1475 // Helper to select logical instruction 1476 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1477 Register Rn, Register Rm, 1478 enum shift_kind kind, unsigned shift) { 1479 switch(opc) { 1480 case Op_AndReductionV: 1481 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1482 break; 1483 case Op_OrReductionV: 1484 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1485 break; 1486 case Op_XorReductionV: 1487 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1488 break; 1489 default: 1490 assert(false, "unsupported"); 1491 ShouldNotReachHere(); 1492 } 1493 } 1494 1495 // Vector reduction logical operations And, Or, Xor 1496 // Clobbers: rscratch1 1497 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1498 Register isrc, FloatRegister vsrc, 1499 unsigned vector_length_in_bytes) { 1500 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1501 "unsupported"); 1502 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1503 assert_different_registers(dst, isrc); 1504 bool isQ = vector_length_in_bytes == 16; 1505 1506 BLOCK_COMMENT("neon_reduce_logical {"); 1507 umov(rscratch1, vsrc, isQ ? D : S, 0); 1508 umov(dst, vsrc, isQ ? 
D : S, 1); 1509 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 1510 switch(bt) { 1511 case T_BYTE: 1512 if (isQ) { 1513 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1514 } 1515 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1516 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 1517 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1518 sxtb(dst, dst); 1519 break; 1520 case T_SHORT: 1521 if (isQ) { 1522 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1523 } 1524 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1525 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1526 sxth(dst, dst); 1527 break; 1528 case T_INT: 1529 if (isQ) { 1530 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1531 } 1532 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1533 break; 1534 case T_LONG: 1535 assert(isQ, "unsupported"); 1536 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 1537 break; 1538 default: 1539 assert(false, "unsupported"); 1540 ShouldNotReachHere(); 1541 } 1542 BLOCK_COMMENT("} neon_reduce_logical"); 1543 } 1544 1545 // Vector reduction min/max for integral type with ASIMD instructions. 1546 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 1547 // Clobbers: rscratch1, rflags 1548 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 1549 Register isrc, FloatRegister vsrc, 1550 unsigned vector_length_in_bytes, 1551 FloatRegister vtmp) { 1552 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 1553 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1554 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 1555 assert_different_registers(dst, isrc); 1556 bool isQ = vector_length_in_bytes == 16; 1557 bool is_min = opc == Op_MinReductionV; 1558 1559 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 1560 if (bt == T_LONG) { 1561 assert(vtmp == fnoreg, "should be"); 1562 assert(isQ, "should be"); 1563 umov(rscratch1, vsrc, D, 0); 1564 cmp(isrc, rscratch1); 1565 csel(dst, isrc, rscratch1, is_min ? LT : GT); 1566 umov(rscratch1, vsrc, D, 1); 1567 cmp(dst, rscratch1); 1568 csel(dst, dst, rscratch1, is_min ? LT : GT); 1569 } else { 1570 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1571 if (size == T2S) { 1572 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 1573 } else { 1574 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 1575 } 1576 if (bt == T_INT) { 1577 umov(dst, vtmp, S, 0); 1578 } else { 1579 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 1580 } 1581 cmpw(dst, isrc); 1582 cselw(dst, dst, isrc, is_min ? LT : GT); 1583 } 1584 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 1585 } 1586 1587 // Vector reduction for integral type with SVE instruction. 1588 // Supported operations are Add, And, Or, Xor, Max, Min. 1589 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
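// For example (illustrative only), an Op_AddReductionVI over byte lanes
// effectively computes
//    dst = src1 + (int8_t)(src2[0] + src2[1] + ... + src2[n-1])
// where only the lanes that are active in pg participate in the sum.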
1590 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 1591 FloatRegister src2, PRegister pg, FloatRegister tmp) { 1592 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 1593 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1594 assert_different_registers(src1, dst); 1595 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 1596 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1597 switch (opc) { 1598 case Op_AddReductionVI: { 1599 sve_uaddv(tmp, size, pg, src2); 1600 if (bt == T_BYTE) { 1601 smov(dst, tmp, size, 0); 1602 addw(dst, src1, dst, ext::sxtb); 1603 } else if (bt == T_SHORT) { 1604 smov(dst, tmp, size, 0); 1605 addw(dst, src1, dst, ext::sxth); 1606 } else { 1607 umov(dst, tmp, size, 0); 1608 addw(dst, dst, src1); 1609 } 1610 break; 1611 } 1612 case Op_AddReductionVL: { 1613 sve_uaddv(tmp, size, pg, src2); 1614 umov(dst, tmp, size, 0); 1615 add(dst, dst, src1); 1616 break; 1617 } 1618 case Op_AndReductionV: { 1619 sve_andv(tmp, size, pg, src2); 1620 if (bt == T_INT || bt == T_LONG) { 1621 umov(dst, tmp, size, 0); 1622 } else { 1623 smov(dst, tmp, size, 0); 1624 } 1625 if (bt == T_LONG) { 1626 andr(dst, dst, src1); 1627 } else { 1628 andw(dst, dst, src1); 1629 } 1630 break; 1631 } 1632 case Op_OrReductionV: { 1633 sve_orv(tmp, size, pg, src2); 1634 if (bt == T_INT || bt == T_LONG) { 1635 umov(dst, tmp, size, 0); 1636 } else { 1637 smov(dst, tmp, size, 0); 1638 } 1639 if (bt == T_LONG) { 1640 orr(dst, dst, src1); 1641 } else { 1642 orrw(dst, dst, src1); 1643 } 1644 break; 1645 } 1646 case Op_XorReductionV: { 1647 sve_eorv(tmp, size, pg, src2); 1648 if (bt == T_INT || bt == T_LONG) { 1649 umov(dst, tmp, size, 0); 1650 } else { 1651 smov(dst, tmp, size, 0); 1652 } 1653 if (bt == T_LONG) { 1654 eor(dst, dst, src1); 1655 } else { 1656 eorw(dst, dst, src1); 1657 } 1658 break; 1659 } 1660 case Op_MaxReductionV: { 1661 sve_smaxv(tmp, size, pg, src2); 1662 if (bt == T_INT || bt == T_LONG) { 1663 umov(dst, tmp, size, 0); 1664 } else { 1665 smov(dst, tmp, size, 0); 1666 } 1667 if (bt == T_LONG) { 1668 cmp(dst, src1); 1669 csel(dst, dst, src1, Assembler::GT); 1670 } else { 1671 cmpw(dst, src1); 1672 cselw(dst, dst, src1, Assembler::GT); 1673 } 1674 break; 1675 } 1676 case Op_MinReductionV: { 1677 sve_sminv(tmp, size, pg, src2); 1678 if (bt == T_INT || bt == T_LONG) { 1679 umov(dst, tmp, size, 0); 1680 } else { 1681 smov(dst, tmp, size, 0); 1682 } 1683 if (bt == T_LONG) { 1684 cmp(dst, src1); 1685 csel(dst, dst, src1, Assembler::LT); 1686 } else { 1687 cmpw(dst, src1); 1688 cselw(dst, dst, src1, Assembler::LT); 1689 } 1690 break; 1691 } 1692 default: 1693 assert(false, "unsupported"); 1694 ShouldNotReachHere(); 1695 } 1696 1697 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 1698 if (bt == T_BYTE) { 1699 sxtb(dst, dst); 1700 } else if (bt == T_SHORT) { 1701 sxth(dst, dst); 1702 } 1703 } 1704 } 1705 1706 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 1707 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 1708 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
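// For example (illustrative only): with bt == T_SHORT and lane_cnt == 3 this
// emits "ptrue dst.h, vl3", activating the first three H lanes; a count such
// as 37 matches none of the fixed ptrue patterns and falls back to the
// "whilelt" encoding with rscratch1 holding the lane count.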
// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all elements to true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

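// The pattern selection above boils down to the following mapping from
// "lane_cnt" to a predicate constraint (a summary of the code above, not
// emitted instructions):
//
//   lane_cnt == 0                            -> PFALSE
//   lane_cnt == max_vector_length            -> PTRUE ALL
//   lane_cnt in {1..8, 16, 32, 64, 128, 256} -> PTRUE VL<lane_cnt>
//   lane_cnt == largest power of 2 <= max    -> PTRUE POW2
//   lane_cnt == largest multiple of 4 <= max -> PTRUE MUL4
//   lane_cnt == largest multiple of 3 <= max -> PTRUE MUL3
//   anything else                            -> WHILELT with lane_cnt in a
//                                               scalar register
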
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src  = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst  = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining elements with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat for the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high (after shifting) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src  = 88 77 66 55 44 33 22 11
  //                  mask = 01 00 00 01 01 00 01 01
  // Expected result: dst  = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining elements with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat for the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:  dst   = 00 00 00 00 00 44 22 11
  // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high (after shifting) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

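// Both compress routines above are the vector form of the following scalar
// loop (illustrative only; "n" is the number of byte/short lanes):
//
//   int k = 0;
//   for (int i = 0; i < n; i++) {
//     if (mask[i]) {
//       dst[k++] = src[i];
//     }
//   }
//   for (; k < n; k++) {
//     dst[k] = 0;
//   }
//
// Since SVE COMPACT only operates on word/doubleword elements, each half is
// widened, compacted and narrowed separately; the compressed high half is
// then moved up by TRUE_CNT lanes with an index/TBL pair and OR-ed into the
// compressed low half.
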
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

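// Both round intrinsics above implement java.lang.Math::round semantics, i.e.
// rounding half up (ties towards positive infinity). A rough per-lane scalar
// sketch, for illustration only ("cvt_ties_to_away" is a stand-in for the
// fcvtas/frinta result, not a real helper):
//
//   long java_round(double x) {
//     if (x < 0.0 && -x < 0x1.0p52) {
//       // Small-magnitude negative values: "ties to away" would round -2.5
//       // to -3, but Java expects -2, so take floor(x + 0.5) instead.
//       return (long)floor(x + 0.5);
//     }
//     // Positive values, large magnitudes (no fractional part) and NaN are
//     // already handled correctly by the "ties to away" conversion.
//     return cvt_ties_to_away(x);
//   }
//
// 0x1.0p23f plays the same role as 0x1.0p52 for the float variants; the
// unsigned compare against the negated input is what selects the floor path.
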
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint);       // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}