1 /* 2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 36 #ifdef PRODUCT 37 #define BLOCK_COMMENT(str) /* nothing */ 38 #define STOP(error) stop(error) 39 #else 40 #define BLOCK_COMMENT(str) block_comment(str) 41 #define STOP(error) block_comment(error); stop(error) 42 #endif 43 44 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 45 46 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 47 48 void C2_MacroAssembler::entry_barrier() { 49 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 50 if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) { 51 // Dummy labels for just measuring the code size 52 Label dummy_slow_path; 53 Label dummy_continuation; 54 Label dummy_guard; 55 Label* slow_path = &dummy_slow_path; 56 Label* continuation = &dummy_continuation; 57 Label* guard = &dummy_guard; 58 if (!Compile::current()->output()->in_scratch_emit_size()) { 59 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 60 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 61 Compile::current()->output()->add_stub(stub); 62 slow_path = &stub->entry(); 63 continuation = &stub->continuation(); 64 guard = &stub->guard(); 65 } 66 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub. 
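    // Conceptually, the barrier emitted below amounts to the following check
    // (a rough sketch only -- the exact instruction sequence is chosen by the
    // collector's BarrierSetAssembler and the names here are illustrative):
    //
    //   int32_t guard = *guard_word_of_this_nmethod;   // emitted at the 'guard' label
    //   if (guard != currently_required_value) {
    //     goto slow_path;                               // out-of-line C2EntryBarrierStub
    //   }
    //   continuation:                                   // normal compiled code resumes here
    //
    // The runtime arms or disarms the nmethod by patching the guard word.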
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

int C2_MacroAssembler::entry_barrier_stub_size() {
  return 4 * 6;
}

// Search for str1 in str2 and return index or -1
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // It describes an algorithm with two shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few java-specific optimizations.
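  //
  // As a worked example of the 'Bad Character' shift (the pattern "NEEDLE" is
  // purely illustrative): with m = 6, the preprocessing loop in the reference
  // code below yields bc['N'] = 5, bc['E'] = 3 (the last occurrence before the
  // final character wins), bc['D'] = 2, bc['L'] = 1, and bc[c] = 6 (= m) for
  // every other character c, i.e. a mismatch on a character that does not occur
  // in the pattern lets us slide the pattern past it completely.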
147 // 148 // #define ASIZE 256 149 // 150 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 151 // int i, j; 152 // unsigned c; 153 // unsigned char bc[ASIZE]; 154 // 155 // /* Preprocessing */ 156 // for (i = 0; i < ASIZE; ++i) 157 // bc[i] = m; 158 // for (i = 0; i < m - 1; ) { 159 // c = x[i]; 160 // ++i; 161 // // c < 256 for Latin1 string, so, no need for branch 162 // #ifdef PATTERN_STRING_IS_LATIN1 163 // bc[c] = m - i; 164 // #else 165 // if (c < ASIZE) bc[c] = m - i; 166 // #endif 167 // } 168 // 169 // /* Searching */ 170 // j = 0; 171 // while (j <= n - m) { 172 // c = y[i+j]; 173 // if (x[m-1] == c) 174 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 175 // if (i < 0) return j; 176 // // c < 256 for Latin1 string, so, no need for branch 177 // #ifdef SOURCE_STRING_IS_LATIN1 178 // // LL case: (c< 256) always true. Remove branch 179 // j += bc[y[j+m-1]]; 180 // #endif 181 // #ifndef PATTERN_STRING_IS_UTF 182 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 183 // if (c < ASIZE) 184 // j += bc[y[j+m-1]]; 185 // else 186 // j += 1 187 // #endif 188 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 189 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 190 // if (c < ASIZE) 191 // j += bc[y[j+m-1]]; 192 // else 193 // j += m 194 // #endif 195 // } 196 // } 197 198 if (icnt1 == -1) { 199 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 200 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 201 Register cnt1end = tmp2; 202 Register str2end = cnt2; 203 Register skipch = tmp2; 204 205 // str1 length is >=8, so, we can read at least 1 register for cases when 206 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 207 // UL case. We'll re-read last character in inner pre-loop code to have 208 // single outer pre-loop load 209 const int firstStep = isL ? 7 : 3; 210 211 const int ASIZE = 256; 212 const int STORED_BYTES = 32; // amount of bytes stored per instruction 213 sub(sp, sp, ASIZE); 214 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 215 mov(ch1, sp); 216 BIND(BM_INIT_LOOP); 217 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 218 subs(tmp5, tmp5, 1); 219 br(GT, BM_INIT_LOOP); 220 221 sub(cnt1tmp, cnt1, 1); 222 mov(tmp5, str2); 223 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 224 sub(ch2, cnt1, 1); 225 mov(tmp3, str1); 226 BIND(BCLOOP); 227 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 228 if (!str1_isL) { 229 subs(zr, ch1, ASIZE); 230 br(HS, BCSKIP); 231 } 232 strb(ch2, Address(sp, ch1)); 233 BIND(BCSKIP); 234 subs(ch2, ch2, 1); 235 br(GT, BCLOOP); 236 237 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 238 if (str1_isL == str2_isL) { 239 // load last 8 bytes (8LL/4UU symbols) 240 ldr(tmp6, Address(tmp6, -wordSize)); 241 } else { 242 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 243 // convert Latin1 to UTF. 
We'll have to wait until load completed, but 244 // it's still faster than per-character loads+checks 245 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 246 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 247 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 248 andr(tmp6, tmp6, 0xFF); // str1[N-4] 249 orr(ch2, ch1, ch2, LSL, 16); 250 orr(tmp6, tmp6, tmp3, LSL, 48); 251 orr(tmp6, tmp6, ch2, LSL, 16); 252 } 253 BIND(BMLOOPSTR2); 254 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 255 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 256 if (str1_isL == str2_isL) { 257 // re-init tmp3. It's for free because it's executed in parallel with 258 // load above. Alternative is to initialize it before loop, but it'll 259 // affect performance on in-order systems with 2 or more ld/st pipelines 260 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 261 } 262 if (!isL) { // UU/UL case 263 lsl(ch2, cnt1tmp, 1); // offset in bytes 264 } 265 cmp(tmp3, skipch); 266 br(NE, BMSKIP); 267 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 268 mov(ch1, tmp6); 269 if (isL) { 270 b(BMLOOPSTR1_AFTER_LOAD); 271 } else { 272 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 273 b(BMLOOPSTR1_CMP); 274 } 275 BIND(BMLOOPSTR1); 276 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 277 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 278 BIND(BMLOOPSTR1_AFTER_LOAD); 279 subs(cnt1tmp, cnt1tmp, 1); 280 br(LT, BMLOOPSTR1_LASTCMP); 281 BIND(BMLOOPSTR1_CMP); 282 cmp(ch1, ch2); 283 br(EQ, BMLOOPSTR1); 284 BIND(BMSKIP); 285 if (!isL) { 286 // if we've met UTF symbol while searching Latin1 pattern, then we can 287 // skip cnt1 symbols 288 if (str1_isL != str2_isL) { 289 mov(result_tmp, cnt1); 290 } else { 291 mov(result_tmp, 1); 292 } 293 subs(zr, skipch, ASIZE); 294 br(HS, BMADV); 295 } 296 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 297 BIND(BMADV); 298 sub(cnt1tmp, cnt1, 1); 299 add(str2, str2, result_tmp, LSL, str2_chr_shift); 300 cmp(str2, str2end); 301 br(LE, BMLOOPSTR2); 302 add(sp, sp, ASIZE); 303 b(NOMATCH); 304 BIND(BMLOOPSTR1_LASTCMP); 305 cmp(ch1, ch2); 306 br(NE, BMSKIP); 307 BIND(BMMATCH); 308 sub(result, str2, tmp5); 309 if (!str2_isL) lsr(result, result, 1); 310 add(sp, sp, ASIZE); 311 b(DONE); 312 313 BIND(LINEARSTUB); 314 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 315 br(LT, LINEAR_MEDIUM); 316 mov(result, zr); 317 RuntimeAddress stub = NULL; 318 if (isL) { 319 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 320 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 321 } else if (str1_isL) { 322 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 323 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 324 } else { 325 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 326 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 327 } 328 address call = trampoline_call(stub); 329 if (call == nullptr) { 330 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 331 ciEnv::current()->record_failure("CodeCache is full"); 332 return; 333 } 334 b(DONE); 335 } 336 337 BIND(LINEARSEARCH); 338 { 339 Label DO1, DO2, DO3; 340 341 Register str2tmp = tmp2; 342 Register first = tmp3; 343 344 if (icnt1 == -1) 345 { 346 
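      // Generic path for a pattern whose length is not known at compile time:
      // scan the source for the first pattern character (FIRST_LOOP) and, on
      // each hit, verify the remaining characters in STR1_LOOP/STR1_NEXT --
      // a simple first-character filter.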
Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 347 348 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 349 br(LT, DOSHORT); 350 BIND(LINEAR_MEDIUM); 351 (this->*str1_load_1chr)(first, Address(str1)); 352 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 353 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 354 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 355 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 356 357 BIND(FIRST_LOOP); 358 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 359 cmp(first, ch2); 360 br(EQ, STR1_LOOP); 361 BIND(STR2_NEXT); 362 adds(cnt2_neg, cnt2_neg, str2_chr_size); 363 br(LE, FIRST_LOOP); 364 b(NOMATCH); 365 366 BIND(STR1_LOOP); 367 adds(cnt1tmp, cnt1_neg, str1_chr_size); 368 add(cnt2tmp, cnt2_neg, str2_chr_size); 369 br(GE, MATCH); 370 371 BIND(STR1_NEXT); 372 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 373 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 374 cmp(ch1, ch2); 375 br(NE, STR2_NEXT); 376 adds(cnt1tmp, cnt1tmp, str1_chr_size); 377 add(cnt2tmp, cnt2tmp, str2_chr_size); 378 br(LT, STR1_NEXT); 379 b(MATCH); 380 381 BIND(DOSHORT); 382 if (str1_isL == str2_isL) { 383 cmp(cnt1, (u1)2); 384 br(LT, DO1); 385 br(GT, DO3); 386 } 387 } 388 389 if (icnt1 == 4) { 390 Label CH1_LOOP; 391 392 (this->*load_4chr)(ch1, str1); 393 sub(result_tmp, cnt2, 4); 394 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 395 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 396 397 BIND(CH1_LOOP); 398 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 399 cmp(ch1, ch2); 400 br(EQ, MATCH); 401 adds(cnt2_neg, cnt2_neg, str2_chr_size); 402 br(LE, CH1_LOOP); 403 b(NOMATCH); 404 } 405 406 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 407 Label CH1_LOOP; 408 409 BIND(DO2); 410 (this->*load_2chr)(ch1, str1); 411 if (icnt1 == 2) { 412 sub(result_tmp, cnt2, 2); 413 } 414 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 415 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 416 BIND(CH1_LOOP); 417 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 418 cmp(ch1, ch2); 419 br(EQ, MATCH); 420 adds(cnt2_neg, cnt2_neg, str2_chr_size); 421 br(LE, CH1_LOOP); 422 b(NOMATCH); 423 } 424 425 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 426 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 427 428 BIND(DO3); 429 (this->*load_2chr)(first, str1); 430 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 431 if (icnt1 == 3) { 432 sub(result_tmp, cnt2, 3); 433 } 434 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 435 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 436 BIND(FIRST_LOOP); 437 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 438 cmpw(first, ch2); 439 br(EQ, STR1_LOOP); 440 BIND(STR2_NEXT); 441 adds(cnt2_neg, cnt2_neg, str2_chr_size); 442 br(LE, FIRST_LOOP); 443 b(NOMATCH); 444 445 BIND(STR1_LOOP); 446 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 447 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 448 cmp(ch1, ch2); 449 br(NE, STR2_NEXT); 450 b(MATCH); 451 } 452 453 if (icnt1 == -1 || icnt1 == 1) { 454 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 455 456 BIND(DO1); 457 (this->*str1_load_1chr)(ch1, str1); 458 cmp(cnt2, (u1)8); 459 br(LT, DO1_SHORT); 460 461 sub(result_tmp, cnt2, 8/str2_chr_size); 462 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 463 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 464 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 465 466 if (str2_isL) { 467 orr(ch1, ch1, ch1, LSL, 8); 468 } 469 orr(ch1, ch1, ch1, LSL, 16); 470 orr(ch1, ch1, ch1, LSL, 32); 471 BIND(CH1_LOOP); 472 ldr(ch2, Address(str2, cnt2_neg)); 473 eor(ch2, ch1, ch2); 474 sub(tmp1, ch2, tmp3); 475 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 476 bics(tmp1, tmp1, tmp2); 477 br(NE, HAS_ZERO); 478 adds(cnt2_neg, cnt2_neg, 8); 479 br(LT, CH1_LOOP); 480 481 cmp(cnt2_neg, (u1)8); 482 mov(cnt2_neg, 0); 483 br(LT, CH1_LOOP); 484 b(NOMATCH); 485 486 BIND(HAS_ZERO); 487 rev(tmp1, tmp1); 488 clz(tmp1, tmp1); 489 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 490 b(MATCH); 491 492 BIND(DO1_SHORT); 493 mov(result_tmp, cnt2); 494 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 495 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 496 BIND(DO1_LOOP); 497 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 498 cmpw(ch1, ch2); 499 br(EQ, MATCH); 500 adds(cnt2_neg, cnt2_neg, str2_chr_size); 501 br(LT, DO1_LOOP); 502 } 503 } 504 BIND(NOMATCH); 505 mov(result, -1); 506 b(DONE); 507 BIND(MATCH); 508 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 509 BIND(DONE); 510 } 511 512 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 513 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 514 515 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 516 Register ch, Register result, 517 Register tmp1, Register tmp2, Register tmp3) 518 { 519 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 520 Register cnt1_neg = cnt1; 521 Register ch1 = rscratch1; 522 Register result_tmp = rscratch2; 523 524 cbz(cnt1, NOMATCH); 525 526 cmp(cnt1, (u1)4); 527 br(LT, DO1_SHORT); 528 529 orr(ch, ch, ch, LSL, 16); 530 orr(ch, ch, ch, LSL, 32); 531 532 sub(cnt1, cnt1, 4); 533 mov(result_tmp, cnt1); 534 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 535 sub(cnt1_neg, zr, cnt1, LSL, 1); 536 537 mov(tmp3, 0x0001000100010001); 538 539 BIND(CH1_LOOP); 540 ldr(ch1, Address(str1, cnt1_neg)); 541 eor(ch1, ch, ch1); 542 sub(tmp1, ch1, tmp3); 543 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 544 bics(tmp1, tmp1, tmp2); 545 br(NE, HAS_ZERO); 546 adds(cnt1_neg, cnt1_neg, 8); 547 br(LT, CH1_LOOP); 548 549 cmp(cnt1_neg, (u1)8); 550 mov(cnt1_neg, 0); 551 br(LT, CH1_LOOP); 552 b(NOMATCH); 553 554 BIND(HAS_ZERO); 555 rev(tmp1, tmp1); 556 clz(tmp1, tmp1); 557 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 558 b(MATCH); 559 560 BIND(DO1_SHORT); 561 mov(result_tmp, cnt1); 562 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 563 sub(cnt1_neg, zr, cnt1, LSL, 1); 564 BIND(DO1_LOOP); 565 ldrh(ch1, Address(str1, cnt1_neg)); 566 cmpw(ch, ch1); 567 br(EQ, MATCH); 568 adds(cnt1_neg, cnt1_neg, 2); 569 br(LT, DO1_LOOP); 570 BIND(NOMATCH); 571 mov(result, -1); 572 b(DONE); 573 BIND(MATCH); 574 add(result, result_tmp, cnt1_neg, ASR, 1); 575 BIND(DONE); 576 } 577 578 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 579 Register ch, Register result, 580 FloatRegister ztmp1, 581 FloatRegister ztmp2, 582 PRegister tmp_pg, 583 PRegister tmp_pdn, bool isL) 584 { 585 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 586 assert(tmp_pg->is_governing(), 587 "this register has to be a governing predicate register"); 588 589 Label LOOP, MATCH, DONE, NOMATCH; 590 Register vec_len = rscratch1; 591 Register idx = rscratch2; 592 593 SIMD_RegVariant T = (isL == true) ? 
B : H; 594 595 cbz(cnt1, NOMATCH); 596 597 // Assign the particular char throughout the vector. 598 sve_dup(ztmp2, T, ch); 599 if (isL) { 600 sve_cntb(vec_len); 601 } else { 602 sve_cnth(vec_len); 603 } 604 mov(idx, 0); 605 606 // Generate a predicate to control the reading of input string. 607 sve_whilelt(tmp_pg, T, idx, cnt1); 608 609 BIND(LOOP); 610 // Read a vector of 8- or 16-bit data depending on the string type. Note 611 // that inactive elements indicated by the predicate register won't cause 612 // a data read from memory to the destination vector. 613 if (isL) { 614 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 615 } else { 616 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 617 } 618 add(idx, idx, vec_len); 619 620 // Perform the comparison. An element of the destination predicate is set 621 // to active if the particular char is matched. 622 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 623 624 // Branch if the particular char is found. 625 br(NE, MATCH); 626 627 sve_whilelt(tmp_pg, T, idx, cnt1); 628 629 // Loop back if the particular char not found. 630 br(MI, LOOP); 631 632 BIND(NOMATCH); 633 mov(result, -1); 634 b(DONE); 635 636 BIND(MATCH); 637 // Undo the index increment. 638 sub(idx, idx, vec_len); 639 640 // Crop the vector to find its location. 641 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 642 add(result, idx, -1); 643 sve_incp(result, T, tmp_pdn); 644 BIND(DONE); 645 } 646 647 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 648 Register ch, Register result, 649 Register tmp1, Register tmp2, Register tmp3) 650 { 651 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 652 Register cnt1_neg = cnt1; 653 Register ch1 = rscratch1; 654 Register result_tmp = rscratch2; 655 656 cbz(cnt1, NOMATCH); 657 658 cmp(cnt1, (u1)8); 659 br(LT, DO1_SHORT); 660 661 orr(ch, ch, ch, LSL, 8); 662 orr(ch, ch, ch, LSL, 16); 663 orr(ch, ch, ch, LSL, 32); 664 665 sub(cnt1, cnt1, 8); 666 mov(result_tmp, cnt1); 667 lea(str1, Address(str1, cnt1)); 668 sub(cnt1_neg, zr, cnt1); 669 670 mov(tmp3, 0x0101010101010101); 671 672 BIND(CH1_LOOP); 673 ldr(ch1, Address(str1, cnt1_neg)); 674 eor(ch1, ch, ch1); 675 sub(tmp1, ch1, tmp3); 676 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 677 bics(tmp1, tmp1, tmp2); 678 br(NE, HAS_ZERO); 679 adds(cnt1_neg, cnt1_neg, 8); 680 br(LT, CH1_LOOP); 681 682 cmp(cnt1_neg, (u1)8); 683 mov(cnt1_neg, 0); 684 br(LT, CH1_LOOP); 685 b(NOMATCH); 686 687 BIND(HAS_ZERO); 688 rev(tmp1, tmp1); 689 clz(tmp1, tmp1); 690 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 691 b(MATCH); 692 693 BIND(DO1_SHORT); 694 mov(result_tmp, cnt1); 695 lea(str1, Address(str1, cnt1)); 696 sub(cnt1_neg, zr, cnt1); 697 BIND(DO1_LOOP); 698 ldrb(ch1, Address(str1, cnt1_neg)); 699 cmp(ch, ch1); 700 br(EQ, MATCH); 701 adds(cnt1_neg, cnt1_neg, 1); 702 br(LT, DO1_LOOP); 703 BIND(NOMATCH); 704 mov(result, -1); 705 b(DONE); 706 BIND(MATCH); 707 add(result, result_tmp, cnt1_neg); 708 BIND(DONE); 709 } 710 711 // Compare strings. 
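// The intrinsic implements the usual compareTo contract; as a scalar sketch
// (chr1/chr2/min are illustrative names only, not real types or helpers):
//
//   int string_compare(chr1* s1, int len1, chr2* s2, int len2) {
//     int n = min(len1, len2);
//     for (int i = 0; i < n; i++) {
//       if (s1[i] != s2[i]) return s1[i] - s2[i];  // first differing character
//     }
//     return len1 - len2;                          // otherwise the shorter string is smaller
//   }
//
// The code below precomputes len1 - len2 into 'result', walks both strings a
// longword at a time (widening Latin1 to UTF-16 with zip1 when the encodings
// differ), and only overwrites 'result' when a differing character is found.
// Very long strings are handed off to the compare_long_string_* stubs.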
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
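  // I.e. result = cnt1 - cnt2 and cnt2 = min(cnt1, cnt2); subsw also sets the
  // flags that the following cselw consumes.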
754 subsw(result, cnt1, cnt2); 755 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 756 757 // A very short string 758 cmpw(cnt2, minCharsInWord); 759 br(Assembler::LE, SHORT_STRING); 760 761 // Compare longwords 762 // load first parts of strings and finish initialization while loading 763 { 764 if (str1_isL == str2_isL) { // LL or UU 765 ldr(tmp1, Address(str1)); 766 cmp(str1, str2); 767 br(Assembler::EQ, DONE); 768 ldr(tmp2, Address(str2)); 769 cmp(cnt2, stub_threshold); 770 br(GE, STUB); 771 subsw(cnt2, cnt2, minCharsInWord); 772 br(EQ, TAIL_CHECK); 773 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 774 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 775 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 776 } else if (isLU) { 777 ldrs(vtmp, Address(str1)); 778 ldr(tmp2, Address(str2)); 779 cmp(cnt2, stub_threshold); 780 br(GE, STUB); 781 subw(cnt2, cnt2, 4); 782 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 783 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 784 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 785 zip1(vtmp, T8B, vtmp, vtmpZ); 786 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 787 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 788 add(cnt1, cnt1, 4); 789 fmovd(tmp1, vtmp); 790 } else { // UL case 791 ldr(tmp1, Address(str1)); 792 ldrs(vtmp, Address(str2)); 793 cmp(cnt2, stub_threshold); 794 br(GE, STUB); 795 subw(cnt2, cnt2, 4); 796 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 797 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 798 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 799 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 800 zip1(vtmp, T8B, vtmp, vtmpZ); 801 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 802 add(cnt1, cnt1, 8); 803 fmovd(tmp2, vtmp); 804 } 805 adds(cnt2, cnt2, isUL ? 4 : 8); 806 br(GE, TAIL); 807 eor(rscratch2, tmp1, tmp2); 808 cbnz(rscratch2, DIFF); 809 // main loop 810 bind(NEXT_WORD); 811 if (str1_isL == str2_isL) { 812 ldr(tmp1, Address(str1, cnt2)); 813 ldr(tmp2, Address(str2, cnt2)); 814 adds(cnt2, cnt2, 8); 815 } else if (isLU) { 816 ldrs(vtmp, Address(str1, cnt1)); 817 ldr(tmp2, Address(str2, cnt2)); 818 add(cnt1, cnt1, 4); 819 zip1(vtmp, T8B, vtmp, vtmpZ); 820 fmovd(tmp1, vtmp); 821 adds(cnt2, cnt2, 8); 822 } else { // UL 823 ldrs(vtmp, Address(str2, cnt2)); 824 ldr(tmp1, Address(str1, cnt1)); 825 zip1(vtmp, T8B, vtmp, vtmpZ); 826 add(cnt1, cnt1, 8); 827 fmovd(tmp2, vtmp); 828 adds(cnt2, cnt2, 4); 829 } 830 br(GE, TAIL); 831 832 eor(rscratch2, tmp1, tmp2); 833 cbz(rscratch2, NEXT_WORD); 834 b(DIFF); 835 bind(TAIL); 836 eor(rscratch2, tmp1, tmp2); 837 cbnz(rscratch2, DIFF); 838 // Last longword. In the case where length == 4 we compare the 839 // same longword twice, but that's still faster than another 840 // conditional branch. 841 if (str1_isL == str2_isL) { 842 ldr(tmp1, Address(str1)); 843 ldr(tmp2, Address(str2)); 844 } else if (isLU) { 845 ldrs(vtmp, Address(str1)); 846 ldr(tmp2, Address(str2)); 847 zip1(vtmp, T8B, vtmp, vtmpZ); 848 fmovd(tmp1, vtmp); 849 } else { // UL 850 ldrs(vtmp, Address(str2)); 851 ldr(tmp1, Address(str1)); 852 zip1(vtmp, T8B, vtmp, vtmpZ); 853 fmovd(tmp2, vtmp); 854 } 855 bind(TAIL_CHECK); 856 eor(rscratch2, tmp1, tmp2); 857 cbz(rscratch2, DONE); 858 859 // Find the first different characters in the longwords and 860 // compute their difference. 861 bind(DIFF); 862 rev(rscratch2, rscratch2); 863 clz(rscratch2, rscratch2); 864 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 865 lsrv(tmp1, tmp1, rscratch2); 866 (this->*ext_chr)(tmp1, tmp1); 867 lsrv(tmp2, tmp2, rscratch2); 868 (this->*ext_chr)(tmp2, tmp2); 869 subw(result, tmp1, tmp2); 870 b(DONE); 871 } 872 873 bind(STUB); 874 RuntimeAddress stub = NULL; 875 switch(ae) { 876 case StrIntrinsicNode::LL: 877 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 878 break; 879 case StrIntrinsicNode::UU: 880 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 881 break; 882 case StrIntrinsicNode::LU: 883 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 884 break; 885 case StrIntrinsicNode::UL: 886 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 887 break; 888 default: 889 ShouldNotReachHere(); 890 } 891 assert(stub.target() != NULL, "compare_long_string stub has not been generated"); 892 address call = trampoline_call(stub); 893 if (call == nullptr) { 894 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 895 ciEnv::current()->record_failure("CodeCache is full"); 896 return; 897 } 898 b(DONE); 899 900 bind(SHORT_STRING); 901 // Is the minimum length zero? 902 cbz(cnt2, DONE); 903 // arrange code to do most branches while loading and loading next characters 904 // while comparing previous 905 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 906 subs(cnt2, cnt2, 1); 907 br(EQ, SHORT_LAST_INIT); 908 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 909 b(SHORT_LOOP_START); 910 bind(SHORT_LOOP); 911 subs(cnt2, cnt2, 1); 912 br(EQ, SHORT_LAST); 913 bind(SHORT_LOOP_START); 914 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 915 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 916 cmp(tmp1, cnt1); 917 br(NE, SHORT_LOOP_TAIL); 918 subs(cnt2, cnt2, 1); 919 br(EQ, SHORT_LAST2); 920 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 921 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 922 cmp(tmp2, rscratch1); 923 br(EQ, SHORT_LOOP); 924 sub(result, tmp2, rscratch1); 925 b(DONE); 926 bind(SHORT_LOOP_TAIL); 927 sub(result, tmp1, cnt1); 928 b(DONE); 929 bind(SHORT_LAST2); 930 cmp(tmp2, rscratch1); 931 br(EQ, DONE); 932 sub(result, tmp2, rscratch1); 933 934 b(DONE); 935 bind(SHORT_LAST_INIT); 936 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 937 bind(SHORT_LAST); 938 cmp(tmp1, cnt1); 939 br(EQ, DONE); 940 sub(result, tmp1, cnt1); 941 942 bind(DONE); 943 944 BLOCK_COMMENT("} string_compare"); 945 } 946 947 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 948 FloatRegister src2, int cond, bool isQ) { 949 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 950 if (bt == T_FLOAT || bt == T_DOUBLE) { 951 switch (cond) { 952 case BoolTest::eq: fcmeq(dst, size, src1, src2); break; 953 case BoolTest::ne: { 954 fcmeq(dst, size, src1, src2); 955 notr(dst, T16B, dst); 956 break; 957 } 958 case BoolTest::ge: fcmge(dst, size, src1, src2); break; 959 case BoolTest::gt: fcmgt(dst, size, src1, src2); break; 960 case BoolTest::le: fcmge(dst, size, src2, src1); break; 961 case BoolTest::lt: fcmgt(dst, size, src2, src1); break; 962 default: 963 assert(false, "unsupported"); 964 ShouldNotReachHere(); 965 } 966 } else { 967 switch (cond) { 968 case BoolTest::eq: cmeq(dst, size, src1, src2); break; 969 case BoolTest::ne: { 970 cmeq(dst, size, src1, src2); 971 notr(dst, T16B, dst); 
972 break; 973 } 974 case BoolTest::ge: cmge(dst, size, src1, src2); break; 975 case BoolTest::gt: cmgt(dst, size, src1, src2); break; 976 case BoolTest::le: cmge(dst, size, src2, src1); break; 977 case BoolTest::lt: cmgt(dst, size, src2, src1); break; 978 case BoolTest::uge: cmhs(dst, size, src1, src2); break; 979 case BoolTest::ugt: cmhi(dst, size, src1, src2); break; 980 case BoolTest::ult: cmhi(dst, size, src2, src1); break; 981 case BoolTest::ule: cmhs(dst, size, src2, src1); break; 982 default: 983 assert(false, "unsupported"); 984 ShouldNotReachHere(); 985 } 986 } 987 } 988 989 // Compress the least significant bit of each byte to the rightmost and clear 990 // the higher garbage bits. 991 void C2_MacroAssembler::bytemask_compress(Register dst) { 992 // Example input, dst = 0x01 00 00 00 01 01 00 01 993 // The "??" bytes are garbage. 994 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 995 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 996 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 997 andr(dst, dst, 0xff); // dst = 0x8D 998 } 999 1000 // Pack the lowest-numbered bit of each mask element in src into a long value 1001 // in dst, at most the first 64 lane elements. 1002 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1003 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1004 FloatRegister vtmp1, FloatRegister vtmp2) { 1005 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1006 assert_different_registers(dst, rscratch1); 1007 assert_different_registers(vtmp1, vtmp2); 1008 1009 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1010 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1011 // Expected: dst = 0x658D 1012 1013 // Convert the mask into vector with sequential bytes. 1014 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1015 sve_cpy(vtmp1, size, src, 1, false); 1016 if (bt != T_BYTE) { 1017 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1018 } 1019 1020 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1021 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1022 // is to compress each significant bit of the byte in a cross-lane way. Due 1023 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1024 // (bit-compress in each lane) with the biggest lane size (T = D) then 1025 // concatenate the results. 1026 1027 // The second source input of BEXT, initialized with 0x01 in each byte. 1028 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1029 sve_dup(vtmp2, B, 1); 1030 1031 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1032 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1033 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1034 // --------------------------------------- 1035 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1036 sve_bext(vtmp1, D, vtmp1, vtmp2); 1037 1038 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1039 // result to dst. 1040 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1041 // dst = 0x658D 1042 if (lane_cnt <= 8) { 1043 // No need to concatenate. 1044 umov(dst, vtmp1, B, 0); 1045 } else if (lane_cnt <= 16) { 1046 ins(vtmp1, B, vtmp1, 1, 8); 1047 umov(dst, vtmp1, H, 0); 1048 } else { 1049 // As the lane count is 64 at most, the final expected value must be in 1050 // the lowest 64 bits after narrowing vtmp1 from D to B. 
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// The example below gives the expected dst predicate register in different types,
// with a valid src (0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D, which
// has 24 significant bits, would be an invalid input if the dst predicate register refers
// to a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01101001 10001101

  // Put long value from general purpose register into the first lane of vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask with a minimum granularity of one byte, we need
  // to transform the bit mask in the first lane into a byte mask, which can be
  // done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate mask according to the given vector, in which the elements have been
  // extended to expected type.
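  // For reference, the 64-bit BDEP lane operation used above can be modelled in
  // scalar code roughly as follows (a sketch, not an architectural definition;
  // count_trailing_zeros stands in for the usual bit-scan helper):
  //
  //   uint64_t bdep64(uint64_t src, uint64_t mask) {
  //     uint64_t dst = 0;
  //     for (int i = 0; mask != 0; i++, mask &= mask - 1) {
  //       int pos = count_trailing_zeros(mask);  // position of the next set mask bit
  //       dst |= ((src >> i) & 1ULL) << pos;     // deposit the i-th source bit there
  //     }
  //     return dst;
  //   }
  //
  // BEXT, used by sve_vmask_tolong above, is the inverse operation: it gathers
  // the bits of its first source selected by the mask into the low-order bits
  // of each 64-bit lane.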
1129 // dst = 0b01101001 10001101 1130 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0); 1131 } 1132 1133 // Clobbers: rflags 1134 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg, 1135 FloatRegister zn, FloatRegister zm, int cond) { 1136 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1137 FloatRegister z1 = zn, z2 = zm; 1138 // Convert the original BoolTest condition to Assembler::condition. 1139 Condition condition; 1140 switch (cond) { 1141 case BoolTest::eq: condition = Assembler::EQ; break; 1142 case BoolTest::ne: condition = Assembler::NE; break; 1143 case BoolTest::le: z1 = zm; z2 = zn; condition = Assembler::GE; break; 1144 case BoolTest::ge: condition = Assembler::GE; break; 1145 case BoolTest::lt: z1 = zm; z2 = zn; condition = Assembler::GT; break; 1146 case BoolTest::gt: condition = Assembler::GT; break; 1147 default: 1148 assert(false, "unsupported compare condition"); 1149 ShouldNotReachHere(); 1150 } 1151 1152 SIMD_RegVariant size = elemType_to_regVariant(bt); 1153 if (bt == T_FLOAT || bt == T_DOUBLE) { 1154 sve_fcm(condition, pd, size, pg, z1, z2); 1155 } else { 1156 assert(is_integral_type(bt), "unsupported element type"); 1157 sve_cmp(condition, pd, size, pg, z1, z2); 1158 } 1159 } 1160 1161 // Get index of the last mask lane that is set 1162 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) { 1163 SIMD_RegVariant size = elemType_to_regVariant(bt); 1164 sve_rev(ptmp, size, src); 1165 sve_brkb(ptmp, ptrue, ptmp, false); 1166 sve_cntp(dst, size, ptrue, ptmp); 1167 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1); 1168 subw(dst, rscratch1, dst); 1169 } 1170 1171 // Extend integer vector src to dst with the same lane count 1172 // but larger element size, e.g. 4B -> 4I 1173 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes, 1174 FloatRegister src, BasicType src_bt) { 1175 if (src_bt == T_BYTE) { 1176 if (dst_bt == T_SHORT) { 1177 // 4B/8B to 4S/8S 1178 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported"); 1179 sxtl(dst, T8H, src, T8B); 1180 } else { 1181 // 4B to 4I 1182 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1183 sxtl(dst, T8H, src, T8B); 1184 sxtl(dst, T4S, dst, T4H); 1185 } 1186 } else if (src_bt == T_SHORT) { 1187 // 4S to 4I 1188 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported"); 1189 sxtl(dst, T4S, src, T4H); 1190 } else if (src_bt == T_INT) { 1191 // 2I to 2L 1192 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported"); 1193 sxtl(dst, T2D, src, T2S); 1194 } else { 1195 ShouldNotReachHere(); 1196 } 1197 } 1198 1199 // Narrow integer vector src down to dst with the same lane count 1200 // but smaller element size, e.g. 
4I -> 4B 1201 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1202 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1203 if (src_bt == T_SHORT) { 1204 // 4S/8S to 4B/8B 1205 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1206 assert(dst_bt == T_BYTE, "unsupported"); 1207 xtn(dst, T8B, src, T8H); 1208 } else if (src_bt == T_INT) { 1209 // 4I to 4B/4S 1210 assert(src_vlen_in_bytes == 16, "unsupported"); 1211 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1212 xtn(dst, T4H, src, T4S); 1213 if (dst_bt == T_BYTE) { 1214 xtn(dst, T8B, dst, T8H); 1215 } 1216 } else if (src_bt == T_LONG) { 1217 // 2L to 2I 1218 assert(src_vlen_in_bytes == 16, "unsupported"); 1219 assert(dst_bt == T_INT, "unsupported"); 1220 xtn(dst, T2S, src, T2D); 1221 } else { 1222 ShouldNotReachHere(); 1223 } 1224 } 1225 1226 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1227 FloatRegister src, SIMD_RegVariant src_size) { 1228 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1229 if (src_size == B) { 1230 switch (dst_size) { 1231 case H: 1232 sve_sunpklo(dst, H, src); 1233 break; 1234 case S: 1235 sve_sunpklo(dst, H, src); 1236 sve_sunpklo(dst, S, dst); 1237 break; 1238 case D: 1239 sve_sunpklo(dst, H, src); 1240 sve_sunpklo(dst, S, dst); 1241 sve_sunpklo(dst, D, dst); 1242 break; 1243 default: 1244 ShouldNotReachHere(); 1245 } 1246 } else if (src_size == H) { 1247 if (dst_size == S) { 1248 sve_sunpklo(dst, S, src); 1249 } else { // D 1250 sve_sunpklo(dst, S, src); 1251 sve_sunpklo(dst, D, dst); 1252 } 1253 } else if (src_size == S) { 1254 sve_sunpklo(dst, D, src); 1255 } 1256 } 1257 1258 // Vector narrow from src to dst with specified element sizes. 1259 // High part of dst vector will be filled with zero. 1260 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1261 FloatRegister src, SIMD_RegVariant src_size, 1262 FloatRegister tmp) { 1263 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1264 assert_different_registers(src, tmp); 1265 sve_dup(tmp, src_size, 0); 1266 if (src_size == D) { 1267 switch (dst_size) { 1268 case S: 1269 sve_uzp1(dst, S, src, tmp); 1270 break; 1271 case H: 1272 assert_different_registers(dst, tmp); 1273 sve_uzp1(dst, S, src, tmp); 1274 sve_uzp1(dst, H, dst, tmp); 1275 break; 1276 case B: 1277 assert_different_registers(dst, tmp); 1278 sve_uzp1(dst, S, src, tmp); 1279 sve_uzp1(dst, H, dst, tmp); 1280 sve_uzp1(dst, B, dst, tmp); 1281 break; 1282 default: 1283 ShouldNotReachHere(); 1284 } 1285 } else if (src_size == S) { 1286 if (dst_size == H) { 1287 sve_uzp1(dst, H, src, tmp); 1288 } else { // B 1289 assert_different_registers(dst, tmp); 1290 sve_uzp1(dst, H, src, tmp); 1291 sve_uzp1(dst, B, dst, tmp); 1292 } 1293 } else if (src_size == H) { 1294 sve_uzp1(dst, B, src, tmp); 1295 } 1296 } 1297 1298 // Extend src predicate to dst predicate with the same lane count but larger 1299 // element size, e.g. 
64Byte -> 512Long 1300 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1301 uint dst_element_length_in_bytes, 1302 uint src_element_length_in_bytes) { 1303 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1304 sve_punpklo(dst, src); 1305 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1306 sve_punpklo(dst, src); 1307 sve_punpklo(dst, dst); 1308 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1309 sve_punpklo(dst, src); 1310 sve_punpklo(dst, dst); 1311 sve_punpklo(dst, dst); 1312 } else { 1313 assert(false, "unsupported"); 1314 ShouldNotReachHere(); 1315 } 1316 } 1317 1318 // Narrow src predicate to dst predicate with the same lane count but 1319 // smaller element size, e.g. 512Long -> 64Byte 1320 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, 1321 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1322 // The insignificant bits in src predicate are expected to be zero. 1323 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1324 sve_uzp1(dst, B, src, src); 1325 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1326 sve_uzp1(dst, H, src, src); 1327 sve_uzp1(dst, B, dst, dst); 1328 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1329 sve_uzp1(dst, S, src, src); 1330 sve_uzp1(dst, H, dst, dst); 1331 sve_uzp1(dst, B, dst, dst); 1332 } else { 1333 assert(false, "unsupported"); 1334 ShouldNotReachHere(); 1335 } 1336 } 1337 1338 // Vector reduction add for integral type with ASIMD instructions. 1339 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1340 Register isrc, FloatRegister vsrc, 1341 unsigned vector_length_in_bytes, 1342 FloatRegister vtmp) { 1343 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1344 assert_different_registers(dst, isrc); 1345 bool isQ = vector_length_in_bytes == 16; 1346 1347 BLOCK_COMMENT("neon_reduce_add_integral {"); 1348 switch(bt) { 1349 case T_BYTE: 1350 addv(vtmp, isQ ? T16B : T8B, vsrc); 1351 smov(dst, vtmp, B, 0); 1352 addw(dst, dst, isrc, ext::sxtb); 1353 break; 1354 case T_SHORT: 1355 addv(vtmp, isQ ? T8H : T4H, vsrc); 1356 smov(dst, vtmp, H, 0); 1357 addw(dst, dst, isrc, ext::sxth); 1358 break; 1359 case T_INT: 1360 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 1361 umov(dst, vtmp, S, 0); 1362 addw(dst, dst, isrc); 1363 break; 1364 case T_LONG: 1365 assert(isQ, "unsupported"); 1366 addpd(vtmp, vsrc); 1367 umov(dst, vtmp, D, 0); 1368 add(dst, dst, isrc); 1369 break; 1370 default: 1371 assert(false, "unsupported"); 1372 ShouldNotReachHere(); 1373 } 1374 BLOCK_COMMENT("} neon_reduce_add_integral"); 1375 } 1376 1377 // Vector reduction multiply for integral type with ASIMD instructions. 1378 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 1379 // Clobbers: rscratch1 1380 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 1381 Register isrc, FloatRegister vsrc, 1382 unsigned vector_length_in_bytes, 1383 FloatRegister vtmp1, FloatRegister vtmp2) { 1384 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1385 bool isQ = vector_length_in_bytes == 16; 1386 1387 BLOCK_COMMENT("neon_reduce_mul_integral {"); 1388 switch(bt) { 1389 case T_BYTE: 1390 if (isQ) { 1391 // Multiply the lower half and higher half of vector iteratively. 
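        // I.e. a halving reduction (sketch): the 16 lanes are folded
        // 16 -> 8 -> 4 -> 2 partial products, and the last two are combined
        // with isrc by the two scalar multiplies below.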
1392 // vtmp1 = vsrc[8:15] 1393 ins(vtmp1, D, vsrc, 0, 1); 1394 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 1395 mulv(vtmp1, T8B, vtmp1, vsrc); 1396 // vtmp2 = vtmp1[4:7] 1397 ins(vtmp2, S, vtmp1, 0, 1); 1398 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 1399 mulv(vtmp1, T8B, vtmp2, vtmp1); 1400 } else { 1401 ins(vtmp1, S, vsrc, 0, 1); 1402 mulv(vtmp1, T8B, vtmp1, vsrc); 1403 } 1404 // vtmp2 = vtmp1[2:3] 1405 ins(vtmp2, H, vtmp1, 0, 1); 1406 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 1407 mulv(vtmp2, T8B, vtmp2, vtmp1); 1408 // dst = vtmp2[0] * isrc * vtmp2[1] 1409 umov(rscratch1, vtmp2, B, 0); 1410 mulw(dst, rscratch1, isrc); 1411 sxtb(dst, dst); 1412 umov(rscratch1, vtmp2, B, 1); 1413 mulw(dst, rscratch1, dst); 1414 sxtb(dst, dst); 1415 break; 1416 case T_SHORT: 1417 if (isQ) { 1418 ins(vtmp2, D, vsrc, 0, 1); 1419 mulv(vtmp2, T4H, vtmp2, vsrc); 1420 ins(vtmp1, S, vtmp2, 0, 1); 1421 mulv(vtmp1, T4H, vtmp1, vtmp2); 1422 } else { 1423 ins(vtmp1, S, vsrc, 0, 1); 1424 mulv(vtmp1, T4H, vtmp1, vsrc); 1425 } 1426 umov(rscratch1, vtmp1, H, 0); 1427 mulw(dst, rscratch1, isrc); 1428 sxth(dst, dst); 1429 umov(rscratch1, vtmp1, H, 1); 1430 mulw(dst, rscratch1, dst); 1431 sxth(dst, dst); 1432 break; 1433 case T_INT: 1434 if (isQ) { 1435 ins(vtmp1, D, vsrc, 0, 1); 1436 mulv(vtmp1, T2S, vtmp1, vsrc); 1437 } else { 1438 vtmp1 = vsrc; 1439 } 1440 umov(rscratch1, vtmp1, S, 0); 1441 mul(dst, rscratch1, isrc); 1442 umov(rscratch1, vtmp1, S, 1); 1443 mul(dst, rscratch1, dst); 1444 break; 1445 case T_LONG: 1446 umov(rscratch1, vsrc, D, 0); 1447 mul(dst, isrc, rscratch1); 1448 umov(rscratch1, vsrc, D, 1); 1449 mul(dst, dst, rscratch1); 1450 break; 1451 default: 1452 assert(false, "unsupported"); 1453 ShouldNotReachHere(); 1454 } 1455 BLOCK_COMMENT("} neon_reduce_mul_integral"); 1456 } 1457 1458 // Vector reduction multiply for floating-point type with ASIMD instructions. 1459 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 1460 FloatRegister fsrc, FloatRegister vsrc, 1461 unsigned vector_length_in_bytes, 1462 FloatRegister vtmp) { 1463 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1464 bool isQ = vector_length_in_bytes == 16; 1465 1466 BLOCK_COMMENT("neon_reduce_mul_fp {"); 1467 switch(bt) { 1468 case T_FLOAT: 1469 fmuls(dst, fsrc, vsrc); 1470 ins(vtmp, S, vsrc, 0, 1); 1471 fmuls(dst, dst, vtmp); 1472 if (isQ) { 1473 ins(vtmp, S, vsrc, 0, 2); 1474 fmuls(dst, dst, vtmp); 1475 ins(vtmp, S, vsrc, 0, 3); 1476 fmuls(dst, dst, vtmp); 1477 } 1478 break; 1479 case T_DOUBLE: 1480 assert(isQ, "unsupported"); 1481 fmuld(dst, fsrc, vsrc); 1482 ins(vtmp, D, vsrc, 0, 1); 1483 fmuld(dst, dst, vtmp); 1484 break; 1485 default: 1486 assert(false, "unsupported"); 1487 ShouldNotReachHere(); 1488 } 1489 BLOCK_COMMENT("} neon_reduce_mul_fp"); 1490 } 1491 1492 // Helper to select logical instruction 1493 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 1494 Register Rn, Register Rm, 1495 enum shift_kind kind, unsigned shift) { 1496 switch(opc) { 1497 case Op_AndReductionV: 1498 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 1499 break; 1500 case Op_OrReductionV: 1501 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 1502 break; 1503 case Op_XorReductionV: 1504 is64 ? 
eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 1505 break; 1506 default: 1507 assert(false, "unsupported"); 1508 ShouldNotReachHere(); 1509 } 1510 } 1511 1512 // Vector reduction logical operations And, Or, Xor 1513 // Clobbers: rscratch1 1514 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 1515 Register isrc, FloatRegister vsrc, 1516 unsigned vector_length_in_bytes) { 1517 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 1518 "unsupported"); 1519 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1520 assert_different_registers(dst, isrc); 1521 bool isQ = vector_length_in_bytes == 16; 1522 1523 BLOCK_COMMENT("neon_reduce_logical {"); 1524 umov(rscratch1, vsrc, isQ ? D : S, 0); 1525 umov(dst, vsrc, isQ ? D : S, 1); 1526 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 1527 switch(bt) { 1528 case T_BYTE: 1529 if (isQ) { 1530 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1531 } 1532 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1533 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 1534 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1535 sxtb(dst, dst); 1536 break; 1537 case T_SHORT: 1538 if (isQ) { 1539 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1540 } 1541 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 1542 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1543 sxth(dst, dst); 1544 break; 1545 case T_INT: 1546 if (isQ) { 1547 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 1548 } 1549 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 1550 break; 1551 case T_LONG: 1552 assert(isQ, "unsupported"); 1553 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 1554 break; 1555 default: 1556 assert(false, "unsupported"); 1557 ShouldNotReachHere(); 1558 } 1559 BLOCK_COMMENT("} neon_reduce_logical"); 1560 } 1561 1562 // Vector reduction min/max for integral type with ASIMD instructions. 1563 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 1564 // Clobbers: rscratch1, rflags 1565 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 1566 Register isrc, FloatRegister vsrc, 1567 unsigned vector_length_in_bytes, 1568 FloatRegister vtmp) { 1569 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 1570 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1571 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 1572 assert_different_registers(dst, isrc); 1573 bool isQ = vector_length_in_bytes == 16; 1574 bool is_min = opc == Op_MinReductionV; 1575 1576 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 1577 if (bt == T_LONG) { 1578 assert(vtmp == fnoreg, "should be"); 1579 assert(isQ, "should be"); 1580 umov(rscratch1, vsrc, D, 0); 1581 cmp(isrc, rscratch1); 1582 csel(dst, isrc, rscratch1, is_min ? LT : GT); 1583 umov(rscratch1, vsrc, D, 1); 1584 cmp(dst, rscratch1); 1585 csel(dst, dst, rscratch1, is_min ? LT : GT); 1586 } else { 1587 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1588 if (size == T2S) { 1589 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 1590 } else { 1591 is_min ? 
sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 1592 } 1593 if (bt == T_INT) { 1594 umov(dst, vtmp, S, 0); 1595 } else { 1596 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 1597 } 1598 cmpw(dst, isrc); 1599 cselw(dst, dst, isrc, is_min ? LT : GT); 1600 } 1601 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 1602 } 1603 1604 // Vector reduction for integral type with SVE instruction. 1605 // Supported operations are Add, And, Or, Xor, Max, Min. 1606 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 1607 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 1608 FloatRegister src2, PRegister pg, FloatRegister tmp) { 1609 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 1610 assert(pg->is_governing(), "This register has to be a governing predicate register"); 1611 assert_different_registers(src1, dst); 1612 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 1613 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1614 switch (opc) { 1615 case Op_AddReductionVI: { 1616 sve_uaddv(tmp, size, pg, src2); 1617 if (bt == T_BYTE) { 1618 smov(dst, tmp, size, 0); 1619 addw(dst, src1, dst, ext::sxtb); 1620 } else if (bt == T_SHORT) { 1621 smov(dst, tmp, size, 0); 1622 addw(dst, src1, dst, ext::sxth); 1623 } else { 1624 umov(dst, tmp, size, 0); 1625 addw(dst, dst, src1); 1626 } 1627 break; 1628 } 1629 case Op_AddReductionVL: { 1630 sve_uaddv(tmp, size, pg, src2); 1631 umov(dst, tmp, size, 0); 1632 add(dst, dst, src1); 1633 break; 1634 } 1635 case Op_AndReductionV: { 1636 sve_andv(tmp, size, pg, src2); 1637 if (bt == T_INT || bt == T_LONG) { 1638 umov(dst, tmp, size, 0); 1639 } else { 1640 smov(dst, tmp, size, 0); 1641 } 1642 if (bt == T_LONG) { 1643 andr(dst, dst, src1); 1644 } else { 1645 andw(dst, dst, src1); 1646 } 1647 break; 1648 } 1649 case Op_OrReductionV: { 1650 sve_orv(tmp, size, pg, src2); 1651 if (bt == T_INT || bt == T_LONG) { 1652 umov(dst, tmp, size, 0); 1653 } else { 1654 smov(dst, tmp, size, 0); 1655 } 1656 if (bt == T_LONG) { 1657 orr(dst, dst, src1); 1658 } else { 1659 orrw(dst, dst, src1); 1660 } 1661 break; 1662 } 1663 case Op_XorReductionV: { 1664 sve_eorv(tmp, size, pg, src2); 1665 if (bt == T_INT || bt == T_LONG) { 1666 umov(dst, tmp, size, 0); 1667 } else { 1668 smov(dst, tmp, size, 0); 1669 } 1670 if (bt == T_LONG) { 1671 eor(dst, dst, src1); 1672 } else { 1673 eorw(dst, dst, src1); 1674 } 1675 break; 1676 } 1677 case Op_MaxReductionV: { 1678 sve_smaxv(tmp, size, pg, src2); 1679 if (bt == T_INT || bt == T_LONG) { 1680 umov(dst, tmp, size, 0); 1681 } else { 1682 smov(dst, tmp, size, 0); 1683 } 1684 if (bt == T_LONG) { 1685 cmp(dst, src1); 1686 csel(dst, dst, src1, Assembler::GT); 1687 } else { 1688 cmpw(dst, src1); 1689 cselw(dst, dst, src1, Assembler::GT); 1690 } 1691 break; 1692 } 1693 case Op_MinReductionV: { 1694 sve_sminv(tmp, size, pg, src2); 1695 if (bt == T_INT || bt == T_LONG) { 1696 umov(dst, tmp, size, 0); 1697 } else { 1698 smov(dst, tmp, size, 0); 1699 } 1700 if (bt == T_LONG) { 1701 cmp(dst, src1); 1702 csel(dst, dst, src1, Assembler::LT); 1703 } else { 1704 cmpw(dst, src1); 1705 cselw(dst, dst, src1, Assembler::LT); 1706 } 1707 break; 1708 } 1709 default: 1710 assert(false, "unsupported"); 1711 ShouldNotReachHere(); 1712 } 1713 1714 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 1715 if (bt == T_BYTE) { 1716 sxtb(dst, dst); 1717 } else if (bt == T_SHORT) { 1718 
      sxth(dst, dst);
    }
  }
}

// Set the elements of the dst predicate register to true for lanes in the range [0, lane_cnt),
// and to false otherwise. The input "lane_cnt" should be less than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all elements to true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch (lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whilelow" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whilelow(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src  = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst  = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend the lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining elements with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
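  // "uzp1" keeps the even-numbered (low) halfword of each word element; using the
  // all-zero vtmp2 as the second source also clears the upper half of the result.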
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat for the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the shifted compressed high part with the compressed low part.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src  = 88 77 66 55 44 33 22 11
  //                  mask = 01 00 00 01 01 00 01 01
  // Expected result: dst  = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend the lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining elements with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat for the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:  dst   = 00 00 00 00 00 44 22 11
  // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
  // Shift the compressed high part left (across lanes) by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the shifted compressed high part with the compressed low part.
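  // The non-zero lanes of the shifted high part and of the low part do not overlap,
  // so a bitwise OR merges them into the final packed result.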
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cmhs(tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1);                      // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint);       // Extract the sign bit of the float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != NULL) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != NULL && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}