1 /*
2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "opto/c2_MacroAssembler.hpp"
28 #include "opto/compile.hpp"
29 #include "opto/intrinsicnode.hpp"
30 #include "opto/matcher.hpp"
31 #include "opto/output.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/stubRoutines.hpp"
34 #include "utilities/globalDefinitions.hpp"
35 #include "utilities/powerOfTwo.hpp"
36
37 #ifdef PRODUCT
38 #define BLOCK_COMMENT(str) /* nothing */
39 #define STOP(error) stop(error)
40 #else
41 #define BLOCK_COMMENT(str) block_comment(str)
42 #define STOP(error) block_comment(error); stop(error)
43 #endif
44
45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
46
47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
48
49 // jdk.internal.util.ArraysSupport.vectorizedHashCode
50 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
51 FloatRegister vdata0, FloatRegister vdata1,
52 FloatRegister vdata2, FloatRegister vdata3,
53 FloatRegister vmul0, FloatRegister vmul1,
54 FloatRegister vmul2, FloatRegister vmul3,
55 FloatRegister vpow, FloatRegister vpowm,
56 BasicType eltype) {
57 ARRAYS_HASHCODE_REGISTERS;
58
59 Register tmp1 = rscratch1, tmp2 = rscratch2;
60
61 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
62
  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
66 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
67 : eltype == T_CHAR || eltype == T_SHORT ? 8
68 : eltype == T_INT ? 4
69 : 0;
70 guarantee(vf, "unsupported eltype");
71
72 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
73 const size_t unroll_factor = 4;
74
75 switch (eltype) {
76 case T_BOOLEAN:
77 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
78 break;
79 case T_CHAR:
80 BLOCK_COMMENT("arrays_hashcode(char) {");
81 break;
82 case T_BYTE:
83 BLOCK_COMMENT("arrays_hashcode(byte) {");
84 break;
85 case T_SHORT:
86 BLOCK_COMMENT("arrays_hashcode(short) {");
87 break;
88 case T_INT:
89 BLOCK_COMMENT("arrays_hashcode(int) {");
90 break;
91 default:
92 ShouldNotReachHere();
93 }
94
95 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
96 // implemented by the stub executes just once. Call the stub only if at least two iterations will
97 // be executed.
98 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
99 cmpw(cnt, large_threshold);
100 br(Assembler::HS, LARGE);
101
102 bind(TAIL);
103
  // The andr computes cnt % uf, where uf = unroll_factor. The sub then steps back from BR_BASE
  // by that remainder shifted left by 3 (one load + madd pair is 8 bytes), so the branch skips
  // uf - (cnt % uf) pairs and executes only cnt % uf load + madd pairs. The loop below then
  // consumes the remaining elements, uf at a time.
107 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
108 andr(tmp2, cnt, unroll_factor - 1);
109 adr(tmp1, BR_BASE);
  // For Cortex-A53 the shift is 4 because 2 nops are generated, making each load + madd step 16 bytes long.
111 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
112 movw(tmp2, 0x1f);
113 br(tmp1);
114
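  // A scalar sketch of the per-element work done by the unrolled loop below, with 31 (0x1f)
  // being the hash multiplier kept in tmp2:
  //
  //   for (size_t i = 0; i < cnt; i++) {
  //     result = 31 * result + ary[i];   // maddw(result, result, tmp2, element)
  //   }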
115 bind(LOOP);
116 for (size_t i = 0; i < unroll_factor; ++i) {
117 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
118 maddw(result, result, tmp2, tmp1);
119 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
120 // Generate 2nd nop to have 4 instructions per iteration.
121 if (VM_Version::supports_a53mac()) {
122 nop();
123 }
124 }
125 bind(BR_BASE);
126 subsw(cnt, cnt, unroll_factor);
127 br(Assembler::HS, LOOP);
128
129 b(DONE);
130
131 bind(LARGE);
132
133 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
134 assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
135 address tpc = trampoline_call(stub);
136 if (tpc == nullptr) {
137 DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
138 postcond(pc() == badAddress);
139 return nullptr;
140 }
141
142 bind(DONE);
143
144 BLOCK_COMMENT("} // arrays_hashcode");
145
146 postcond(pc() != badAddress);
147 return pc();
148 }
149
150 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
151 Register t2, Register t3) {
152 assert_different_registers(obj, box, t1, t2, t3, rscratch2);
153
154 // Handle inflated monitor.
155 Label inflated;
  // Finish fast lock successfully. MUST branch to this label with flag == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to this label with flag == NE.
  Label slow_path;
160
161 if (UseObjectMonitorTable) {
162 // Clear cache in case fast locking succeeds or we need to take the slow-path.
163 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
164 }
165
166 if (DiagnoseSyncOnValueBasedClasses != 0) {
167 load_klass(t1, obj);
168 ldrb(t1, Address(t1, Klass::misc_flags_offset()));
169 tst(t1, KlassFlags::_misc_is_value_based_class);
170 br(Assembler::NE, slow_path);
171 }
172
173 const Register t1_mark = t1;
174 const Register t3_t = t3;
175
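  // A rough sketch of the lightweight-locking fast path below (assuming mark word lock
  // bits 0b01 = unlocked and 0b10 = monitor, and a lock-stack that grows upwards):
  //
  //   if (lock_stack is full)                                  goto slow_path;
  //   if (lock_stack top == obj)                               goto push;      // recursive case
  //   mark = obj->mark();
  //   if (mark & monitor_value)                                goto inflated;
  //   if (!CAS(obj->mark, mark | unlocked, mark & ~unlocked))  goto slow_path;
  //   push: lock_stack.push(obj);                              // flag == EQ on success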
176 { // Lightweight locking
177
178 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
179 Label push;
180
181 const Register t2_top = t2;
182
183 // Check if lock-stack is full.
184 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
185 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
186 br(Assembler::GT, slow_path);
187
188 // Check if recursive.
189 subw(t3_t, t2_top, oopSize);
190 ldr(t3_t, Address(rthread, t3_t));
191 cmp(obj, t3_t);
192 br(Assembler::EQ, push);
193
194 // Relaxed normal load to check for monitor. Optimization for monitor case.
195 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
196 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
197
198 // Not inflated
199 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
200
201 // Try to lock. Transition lock-bits 0b01 => 0b00
202 orr(t1_mark, t1_mark, markWord::unlocked_value);
203 eor(t3_t, t1_mark, markWord::unlocked_value);
204 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
205 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
206 br(Assembler::NE, slow_path);
207
208 bind(push);
209 // After successful lock, push object on lock-stack.
210 str(obj, Address(rthread, t2_top));
211 addw(t2_top, t2_top, oopSize);
212 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
213 b(locked);
214 }
215
216 { // Handle inflated monitor.
217 bind(inflated);
218
219 const Register t1_monitor = t1;
220
221 if (!UseObjectMonitorTable) {
222 assert(t1_monitor == t1_mark, "should be the same here");
223 } else {
224 Label monitor_found;
225
226 // Load cache address
227 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
228
229 const int num_unrolled = 2;
230 for (int i = 0; i < num_unrolled; i++) {
231 ldr(t1, Address(t3_t));
232 cmp(obj, t1);
233 br(Assembler::EQ, monitor_found);
234 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
235 }
236
237 Label loop;
238
239 // Search for obj in cache.
240 bind(loop);
241
242 // Check for match.
243 ldr(t1, Address(t3_t));
244 cmp(obj, t1);
245 br(Assembler::EQ, monitor_found);
246
247 // Search until null encountered, guaranteed _null_sentinel at end.
248 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
249 cbnz(t1, loop);
250 // Cache Miss, NE set from cmp above, cbnz does not set flags
251 b(slow_path);
252
253 bind(monitor_found);
254 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
255 }
256
257 const Register t2_owner_addr = t2;
258 const Register t3_owner = t3;
259 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
260 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
261 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
262
263 Label monitor_locked;
264
265 // Compute owner address.
266 lea(t2_owner_addr, owner_address);
267
268 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
269 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
270 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
271 /*release*/ false, /*weak*/ false, t3_owner);
272 br(Assembler::EQ, monitor_locked);
273
274 // Check if recursive.
275 cmp(t3_owner, rscratch2);
276 br(Assembler::NE, slow_path);
277
278 // Recursive.
279 increment(recursions_address, 1);
280
281 bind(monitor_locked);
282 if (UseObjectMonitorTable) {
283 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
284 }
285 }
286
287 bind(locked);
288
289 #ifdef ASSERT
290 // Check that locked label is reached with Flags == EQ.
291 Label flag_correct;
292 br(Assembler::EQ, flag_correct);
293 stop("Fast Lock Flag != EQ");
294 #endif
295
296 bind(slow_path);
297 #ifdef ASSERT
298 // Check that slow_path label is reached with Flags == NE.
299 br(Assembler::NE, flag_correct);
300 stop("Fast Lock Flag != NE");
301 bind(flag_correct);
302 #endif
303 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
304 }
305
306 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
307 Register t2, Register t3) {
308 assert_different_registers(obj, box, t1, t2, t3);
309
310 // Handle inflated monitor.
311 Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to this label with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to this label with flag == NE.
  Label slow_path;
316
317 const Register t1_mark = t1;
318 const Register t2_top = t2;
319 const Register t3_t = t3;
320
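  // A rough sketch of the lightweight-unlocking fast path below:
  //
  //   if (lock_stack top != obj)                       goto inflated_load_mark; // must be a monitor
  //   lock_stack.pop();
  //   if (new lock_stack top == obj)                   goto unlocked;           // recursive case
  //   mark = obj->mark();
  //   if (mark & monitor_value)                        goto inflated (or re-push obj and go slow);
  //   if (!CAS(obj->mark, mark, mark | unlocked))      { re-push obj; goto slow_path; }
  //   goto unlocked;                                   // flag == EQ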
321 { // Lightweight unlock
322
323 Label push_and_slow_path;
324
325 // Check if obj is top of lock-stack.
326 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
327 subw(t2_top, t2_top, oopSize);
328 ldr(t3_t, Address(rthread, t2_top));
329 cmp(obj, t3_t);
330 // Top of lock stack was not obj. Must be monitor.
331 br(Assembler::NE, inflated_load_mark);
332
333 // Pop lock-stack.
334 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
335 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
336
337 // Check if recursive.
338 subw(t3_t, t2_top, oopSize);
339 ldr(t3_t, Address(rthread, t3_t));
340 cmp(obj, t3_t);
341 br(Assembler::EQ, unlocked);
342
343 // Not recursive.
344 // Load Mark.
345 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
346
347 // Check header for monitor (0b10).
348 // Because we got here by popping (meaning we pushed in locked)
349 // there will be no monitor in the box. So we need to push back the obj
350 // so that the runtime can fix any potential anonymous owner.
351 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
352
353 // Try to unlock. Transition lock bits 0b00 => 0b01
354 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
355 orr(t3_t, t1_mark, markWord::unlocked_value);
356 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
357 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
358 br(Assembler::EQ, unlocked);
359
360 bind(push_and_slow_path);
361 // Compare and exchange failed.
362 // Restore lock-stack and handle the unlock in runtime.
363 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
364 addw(t2_top, t2_top, oopSize);
365 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
366 b(slow_path);
367 }
368
369
370 { // Handle inflated monitor.
371 bind(inflated_load_mark);
372 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
373 #ifdef ASSERT
374 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
375 stop("Fast Unlock not monitor");
376 #endif
377
378 bind(inflated);
379
380 #ifdef ASSERT
381 Label check_done;
382 subw(t2_top, t2_top, oopSize);
383 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
384 br(Assembler::LT, check_done);
385 ldr(t3_t, Address(rthread, t2_top));
386 cmp(obj, t3_t);
387 br(Assembler::NE, inflated);
388 stop("Fast Unlock lock on stack");
389 bind(check_done);
390 #endif
391
392 const Register t1_monitor = t1;
393
394 if (!UseObjectMonitorTable) {
395 assert(t1_monitor == t1_mark, "should be the same here");
396
397 // Untag the monitor.
398 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
399 } else {
400 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
401 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
402 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
403 br(Assembler::LO, slow_path);
404 }
405
406 const Register t2_recursions = t2;
407 Label not_recursive;
408
409 // Check if recursive.
410 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
411 cbz(t2_recursions, not_recursive);
412
413 // Recursive unlock.
414 sub(t2_recursions, t2_recursions, 1u);
415 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
416 // Set flag == EQ
417 cmp(t2_recursions, t2_recursions);
418 b(unlocked);
419
420 bind(not_recursive);
421
422 const Register t2_owner_addr = t2;
423
424 // Compute owner address.
425 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
426
427 // Set owner to null.
428 // Release to satisfy the JMM
429 stlr(zr, t2_owner_addr);
430 // We need a full fence after clearing owner to avoid stranding.
431 // StoreLoad achieves this.
432 membar(StoreLoad);
433
434 // Check if the entry_list is empty.
435 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
436 cmp(rscratch1, zr);
437 br(Assembler::EQ, unlocked); // If so we are done.
438
439 // Check if there is a successor.
440 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
441 cmp(rscratch1, zr);
442 br(Assembler::NE, unlocked); // If so we are done.
443
444 // Save the monitor pointer in the current thread, so we can try to
445 // reacquire the lock in SharedRuntime::monitor_exit_helper().
446 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
447
448 cmp(zr, rthread); // Set Flag to NE => slow path
449 b(slow_path);
450 }
451
452 bind(unlocked);
453 cmp(zr, zr); // Set Flags to EQ => fast path
454
455 #ifdef ASSERT
456 // Check that unlocked label is reached with Flags == EQ.
457 Label flag_correct;
458 br(Assembler::EQ, flag_correct);
459 stop("Fast Unlock Flag != EQ");
460 #endif
461
462 bind(slow_path);
463 #ifdef ASSERT
464 // Check that slow_path label is reached with Flags == NE.
465 br(Assembler::NE, flag_correct);
466 stop("Fast Unlock Flag != NE");
467 bind(flag_correct);
468 #endif
469 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
470 }
471
472 // Search for str1 in str2 and return index or -1
473 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
474 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
475 Register cnt2, Register cnt1,
476 Register tmp1, Register tmp2,
477 Register tmp3, Register tmp4,
478 Register tmp5, Register tmp6,
479 int icnt1, Register result, int ae) {
480 // NOTE: tmp5, tmp6 can be zr depending on specific method version
481 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
482
483 Register ch1 = rscratch1;
484 Register ch2 = rscratch2;
485 Register cnt1tmp = tmp1;
486 Register cnt2tmp = tmp2;
487 Register cnt1_neg = cnt1;
488 Register cnt2_neg = cnt2;
489 Register result_tmp = tmp4;
490
491 bool isL = ae == StrIntrinsicNode::LL;
492
493 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
494 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
495 int str1_chr_shift = str1_isL ? 0:1;
496 int str2_chr_shift = str2_isL ? 0:1;
497 int str1_chr_size = str1_isL ? 1:2;
498 int str2_chr_size = str2_isL ? 1:2;
499 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
500 (chr_insn)&MacroAssembler::ldrh;
501 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
502 (chr_insn)&MacroAssembler::ldrh;
503 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
504 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
505
506 // Note, inline_string_indexOf() generates checks:
507 // if (substr.count > string.count) return -1;
508 // if (substr.count == 0) return 0;
509
510 // We have two strings, a source string in str2, cnt2 and a pattern string
511 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
512
513 // For larger pattern and source we use a simplified Boyer Moore algorithm.
514 // With a small pattern and source we use linear scan.
515
516 if (icnt1 == -1) {
517 sub(result_tmp, cnt2, cnt1);
518 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
519 br(LT, LINEARSEARCH);
520 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
521 subs(zr, cnt1, 256);
522 lsr(tmp1, cnt2, 2);
523 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
524 br(GE, LINEARSTUB);
525 }
526
  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
532 // and the 'Good Suffix' rule.
533 //
534 // These rules are essentially heuristics for how far we can shift the
535 // pattern along the search string.
536 //
537 // The implementation here uses the 'Bad Character' rule only because of the
538 // complexity of initialisation for the 'Good Suffix' rule.
539 //
  // This is also known as the Boyer-Moore-Horspool algorithm:
541 //
542 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
543 //
  // This particular implementation has a few Java-specific optimizations.
545 //
546 // #define ASIZE 256
547 //
548 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
549 // int i, j;
550 // unsigned c;
551 // unsigned char bc[ASIZE];
552 //
553 // /* Preprocessing */
554 // for (i = 0; i < ASIZE; ++i)
555 // bc[i] = m;
556 // for (i = 0; i < m - 1; ) {
557 // c = x[i];
558 // ++i;
559 // // c < 256 for Latin1 string, so, no need for branch
560 // #ifdef PATTERN_STRING_IS_LATIN1
561 // bc[c] = m - i;
562 // #else
563 // if (c < ASIZE) bc[c] = m - i;
564 // #endif
565 // }
566 //
567 // /* Searching */
568 // j = 0;
569 // while (j <= n - m) {
  //     c = y[j+m-1];
571 // if (x[m-1] == c)
572 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
573 // if (i < 0) return j;
574 // // c < 256 for Latin1 string, so, no need for branch
575 // #ifdef SOURCE_STRING_IS_LATIN1
576 // // LL case: (c< 256) always true. Remove branch
577 // j += bc[y[j+m-1]];
578 // #endif
579 // #ifndef PATTERN_STRING_IS_UTF
580 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
581 // if (c < ASIZE)
582 // j += bc[y[j+m-1]];
583 // else
584 // j += 1
585 // #endif
586 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
587 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
588 // if (c < ASIZE)
589 // j += bc[y[j+m-1]];
590 // else
591 // j += m
592 // #endif
593 // }
594 // }
595
596 if (icnt1 == -1) {
597 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
598 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
599 Register cnt1end = tmp2;
600 Register str2end = cnt2;
601 Register skipch = tmp2;
602
    // str1 length is >= 8, so we can read at least 1 register for the cases where
    // UTF->Latin1 conversion is not needed (8 chars for LL or 4 for UU) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code so that the outer pre-loop needs only a single load.
607 const int firstStep = isL ? 7 : 3;
608
609 const int ASIZE = 256;
610 const int STORED_BYTES = 32; // amount of bytes stored per instruction
611 sub(sp, sp, ASIZE);
612 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
613 mov(ch1, sp);
614 BIND(BM_INIT_LOOP);
615 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
616 subs(tmp5, tmp5, 1);
617 br(GT, BM_INIT_LOOP);
618
619 sub(cnt1tmp, cnt1, 1);
620 mov(tmp5, str2);
621 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
622 sub(ch2, cnt1, 1);
623 mov(tmp3, str1);
624 BIND(BCLOOP);
625 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
626 if (!str1_isL) {
627 subs(zr, ch1, ASIZE);
628 br(HS, BCSKIP);
629 }
630 strb(ch2, Address(sp, ch1));
631 BIND(BCSKIP);
632 subs(ch2, ch2, 1);
633 br(GT, BCLOOP);
634
635 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
636 if (str1_isL == str2_isL) {
637 // load last 8 bytes (8LL/4UU symbols)
638 ldr(tmp6, Address(tmp6, -wordSize));
639 } else {
640 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
      // Convert Latin1 to UTF. We have to wait until the load completes, but
      // it's still faster than per-character loads + checks.
643 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
644 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
645 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
646 andr(tmp6, tmp6, 0xFF); // str1[N-4]
647 orr(ch2, ch1, ch2, LSL, 16);
648 orr(tmp6, tmp6, tmp3, LSL, 48);
649 orr(tmp6, tmp6, ch2, LSL, 16);
650 }
651 BIND(BMLOOPSTR2);
652 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
653 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
654 if (str1_isL == str2_isL) {
      // Re-init tmp3. It's free because it executes in parallel with the
      // load above. The alternative is to initialize it before the loop, but
      // that would hurt performance on in-order systems with 2 or more ld/st pipelines.
658 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
659 }
660 if (!isL) { // UU/UL case
661 lsl(ch2, cnt1tmp, 1); // offset in bytes
662 }
663 cmp(tmp3, skipch);
664 br(NE, BMSKIP);
665 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
666 mov(ch1, tmp6);
667 if (isL) {
668 b(BMLOOPSTR1_AFTER_LOAD);
669 } else {
670 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
671 b(BMLOOPSTR1_CMP);
672 }
673 BIND(BMLOOPSTR1);
674 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
675 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
676 BIND(BMLOOPSTR1_AFTER_LOAD);
677 subs(cnt1tmp, cnt1tmp, 1);
678 br(LT, BMLOOPSTR1_LASTCMP);
679 BIND(BMLOOPSTR1_CMP);
680 cmp(ch1, ch2);
681 br(EQ, BMLOOPSTR1);
682 BIND(BMSKIP);
683 if (!isL) {
      // If we've hit a UTF symbol while searching for a Latin1 pattern, then we can
      // skip cnt1 symbols.
686 if (str1_isL != str2_isL) {
687 mov(result_tmp, cnt1);
688 } else {
689 mov(result_tmp, 1);
690 }
691 subs(zr, skipch, ASIZE);
692 br(HS, BMADV);
693 }
694 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
695 BIND(BMADV);
696 sub(cnt1tmp, cnt1, 1);
697 add(str2, str2, result_tmp, LSL, str2_chr_shift);
698 cmp(str2, str2end);
699 br(LE, BMLOOPSTR2);
700 add(sp, sp, ASIZE);
701 b(NOMATCH);
702 BIND(BMLOOPSTR1_LASTCMP);
703 cmp(ch1, ch2);
704 br(NE, BMSKIP);
705 BIND(BMMATCH);
706 sub(result, str2, tmp5);
707 if (!str2_isL) lsr(result, result, 1);
708 add(sp, sp, ASIZE);
709 b(DONE);
710
711 BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
713 br(LT, LINEAR_MEDIUM);
714 mov(result, zr);
715 RuntimeAddress stub = nullptr;
716 if (isL) {
717 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
718 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
719 } else if (str1_isL) {
720 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
721 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
722 } else {
723 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
724 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
725 }
726 address call = trampoline_call(stub);
727 if (call == nullptr) {
728 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
729 ciEnv::current()->record_failure("CodeCache is full");
730 return;
731 }
732 b(DONE);
733 }
734
735 BIND(LINEARSEARCH);
736 {
737 Label DO1, DO2, DO3;
738
739 Register str2tmp = tmp2;
740 Register first = tmp3;
741
742 if (icnt1 == -1)
743 {
744 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
745
746 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
747 br(LT, DOSHORT);
748 BIND(LINEAR_MEDIUM);
749 (this->*str1_load_1chr)(first, Address(str1));
750 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
751 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
752 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
753 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
754
755 BIND(FIRST_LOOP);
756 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
757 cmp(first, ch2);
758 br(EQ, STR1_LOOP);
759 BIND(STR2_NEXT);
760 adds(cnt2_neg, cnt2_neg, str2_chr_size);
761 br(LE, FIRST_LOOP);
762 b(NOMATCH);
763
764 BIND(STR1_LOOP);
765 adds(cnt1tmp, cnt1_neg, str1_chr_size);
766 add(cnt2tmp, cnt2_neg, str2_chr_size);
767 br(GE, MATCH);
768
769 BIND(STR1_NEXT);
770 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
771 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
772 cmp(ch1, ch2);
773 br(NE, STR2_NEXT);
774 adds(cnt1tmp, cnt1tmp, str1_chr_size);
775 add(cnt2tmp, cnt2tmp, str2_chr_size);
776 br(LT, STR1_NEXT);
777 b(MATCH);
778
779 BIND(DOSHORT);
780 if (str1_isL == str2_isL) {
781 cmp(cnt1, (u1)2);
782 br(LT, DO1);
783 br(GT, DO3);
784 }
785 }
786
787 if (icnt1 == 4) {
788 Label CH1_LOOP;
789
790 (this->*load_4chr)(ch1, str1);
791 sub(result_tmp, cnt2, 4);
792 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
793 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
794
795 BIND(CH1_LOOP);
796 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
797 cmp(ch1, ch2);
798 br(EQ, MATCH);
799 adds(cnt2_neg, cnt2_neg, str2_chr_size);
800 br(LE, CH1_LOOP);
801 b(NOMATCH);
802 }
803
804 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
805 Label CH1_LOOP;
806
807 BIND(DO2);
808 (this->*load_2chr)(ch1, str1);
809 if (icnt1 == 2) {
810 sub(result_tmp, cnt2, 2);
811 }
812 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
813 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
814 BIND(CH1_LOOP);
815 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
816 cmp(ch1, ch2);
817 br(EQ, MATCH);
818 adds(cnt2_neg, cnt2_neg, str2_chr_size);
819 br(LE, CH1_LOOP);
820 b(NOMATCH);
821 }
822
823 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
824 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
825
826 BIND(DO3);
827 (this->*load_2chr)(first, str1);
828 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
829 if (icnt1 == 3) {
830 sub(result_tmp, cnt2, 3);
831 }
832 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
833 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
834 BIND(FIRST_LOOP);
835 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
836 cmpw(first, ch2);
837 br(EQ, STR1_LOOP);
838 BIND(STR2_NEXT);
839 adds(cnt2_neg, cnt2_neg, str2_chr_size);
840 br(LE, FIRST_LOOP);
841 b(NOMATCH);
842
843 BIND(STR1_LOOP);
844 add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
845 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
846 cmp(ch1, ch2);
847 br(NE, STR2_NEXT);
848 b(MATCH);
849 }
850
851 if (icnt1 == -1 || icnt1 == 1) {
852 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
853
854 BIND(DO1);
855 (this->*str1_load_1chr)(ch1, str1);
856 cmp(cnt2, (u1)8);
857 br(LT, DO1_SHORT);
858
859 sub(result_tmp, cnt2, 8/str2_chr_size);
860 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
861 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
862 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
863
864 if (str2_isL) {
865 orr(ch1, ch1, ch1, LSL, 8);
866 }
867 orr(ch1, ch1, ch1, LSL, 16);
868 orr(ch1, ch1, ch1, LSL, 32);
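      // A sketch of the SWAR match test used in the loop below: ch1 holds the pattern
      // character replicated into every lane, so after the eor a matching lane of ch2
      // becomes zero. A zero lane is then detected with the classic trick
      //
      //   has_zero(v) = (v - 0x01..01) & ~(v | 0x7f..7f)   // per byte, or 0x0001../0x7fff.. per char
      //
      // which is non-zero iff some lane of v is zero; bics computes exactly this and sets the flags.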
869 BIND(CH1_LOOP);
870 ldr(ch2, Address(str2, cnt2_neg));
871 eor(ch2, ch1, ch2);
872 sub(tmp1, ch2, tmp3);
873 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
874 bics(tmp1, tmp1, tmp2);
875 br(NE, HAS_ZERO);
876 adds(cnt2_neg, cnt2_neg, 8);
877 br(LT, CH1_LOOP);
878
879 cmp(cnt2_neg, (u1)8);
880 mov(cnt2_neg, 0);
881 br(LT, CH1_LOOP);
882 b(NOMATCH);
883
884 BIND(HAS_ZERO);
885 rev(tmp1, tmp1);
886 clz(tmp1, tmp1);
887 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
888 b(MATCH);
889
890 BIND(DO1_SHORT);
891 mov(result_tmp, cnt2);
892 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
893 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
894 BIND(DO1_LOOP);
895 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
896 cmpw(ch1, ch2);
897 br(EQ, MATCH);
898 adds(cnt2_neg, cnt2_neg, str2_chr_size);
899 br(LT, DO1_LOOP);
900 }
901 }
902 BIND(NOMATCH);
903 mov(result, -1);
904 b(DONE);
905 BIND(MATCH);
906 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
907 BIND(DONE);
908 }
909
910 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
911 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
912
913 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
914 Register ch, Register result,
915 Register tmp1, Register tmp2, Register tmp3)
916 {
917 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
918 Register cnt1_neg = cnt1;
919 Register ch1 = rscratch1;
920 Register result_tmp = rscratch2;
921
922 cbz(cnt1, NOMATCH);
923
924 cmp(cnt1, (u1)4);
925 br(LT, DO1_SHORT);
926
927 orr(ch, ch, ch, LSL, 16);
928 orr(ch, ch, ch, LSL, 32);
929
930 sub(cnt1, cnt1, 4);
931 mov(result_tmp, cnt1);
932 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
933 sub(cnt1_neg, zr, cnt1, LSL, 1);
934
935 mov(tmp3, 0x0001000100010001);
936
937 BIND(CH1_LOOP);
938 ldr(ch1, Address(str1, cnt1_neg));
939 eor(ch1, ch, ch1);
940 sub(tmp1, ch1, tmp3);
941 orr(tmp2, ch1, 0x7fff7fff7fff7fff);
942 bics(tmp1, tmp1, tmp2);
943 br(NE, HAS_ZERO);
944 adds(cnt1_neg, cnt1_neg, 8);
945 br(LT, CH1_LOOP);
946
947 cmp(cnt1_neg, (u1)8);
948 mov(cnt1_neg, 0);
949 br(LT, CH1_LOOP);
950 b(NOMATCH);
951
952 BIND(HAS_ZERO);
953 rev(tmp1, tmp1);
954 clz(tmp1, tmp1);
955 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
956 b(MATCH);
957
958 BIND(DO1_SHORT);
959 mov(result_tmp, cnt1);
960 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
961 sub(cnt1_neg, zr, cnt1, LSL, 1);
962 BIND(DO1_LOOP);
963 ldrh(ch1, Address(str1, cnt1_neg));
964 cmpw(ch, ch1);
965 br(EQ, MATCH);
966 adds(cnt1_neg, cnt1_neg, 2);
967 br(LT, DO1_LOOP);
968 BIND(NOMATCH);
969 mov(result, -1);
970 b(DONE);
971 BIND(MATCH);
972 add(result, result_tmp, cnt1_neg, ASR, 1);
973 BIND(DONE);
974 }
975
976 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
977 Register ch, Register result,
978 FloatRegister ztmp1,
979 FloatRegister ztmp2,
980 PRegister tmp_pg,
981 PRegister tmp_pdn, bool isL)
982 {
983 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
984 assert(tmp_pg->is_governing(),
985 "this register has to be a governing predicate register");
986
987 Label LOOP, MATCH, DONE, NOMATCH;
988 Register vec_len = rscratch1;
989 Register idx = rscratch2;
990
  SIMD_RegVariant T = isL ? B : H;
992
993 cbz(cnt1, NOMATCH);
994
995 // Assign the particular char throughout the vector.
996 sve_dup(ztmp2, T, ch);
997 if (isL) {
998 sve_cntb(vec_len);
999 } else {
1000 sve_cnth(vec_len);
1001 }
1002 mov(idx, 0);
1003
1004 // Generate a predicate to control the reading of input string.
1005 sve_whilelt(tmp_pg, T, idx, cnt1);
1006
1007 BIND(LOOP);
1008 // Read a vector of 8- or 16-bit data depending on the string type. Note
1009 // that inactive elements indicated by the predicate register won't cause
1010 // a data read from memory to the destination vector.
1011 if (isL) {
1012 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1013 } else {
1014 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1015 }
1016 add(idx, idx, vec_len);
1017
1018 // Perform the comparison. An element of the destination predicate is set
1019 // to active if the particular char is matched.
1020 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1021
1022 // Branch if the particular char is found.
1023 br(NE, MATCH);
1024
1025 sve_whilelt(tmp_pg, T, idx, cnt1);
1026
  // Loop back if the particular char is not found.
1028 br(MI, LOOP);
1029
1030 BIND(NOMATCH);
1031 mov(result, -1);
1032 b(DONE);
1033
1034 BIND(MATCH);
1035 // Undo the index increment.
1036 sub(idx, idx, vec_len);
1037
  // Break the predicate after the first match so that counting the active lanes below yields its position.
1039 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1040 add(result, idx, -1);
1041 sve_incp(result, T, tmp_pdn);
1042 BIND(DONE);
1043 }
1044
1045 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1046 Register ch, Register result,
1047 Register tmp1, Register tmp2, Register tmp3)
1048 {
1049 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1050 Register cnt1_neg = cnt1;
1051 Register ch1 = rscratch1;
1052 Register result_tmp = rscratch2;
1053
1054 cbz(cnt1, NOMATCH);
1055
1056 cmp(cnt1, (u1)8);
1057 br(LT, DO1_SHORT);
1058
1059 orr(ch, ch, ch, LSL, 8);
1060 orr(ch, ch, ch, LSL, 16);
1061 orr(ch, ch, ch, LSL, 32);
1062
1063 sub(cnt1, cnt1, 8);
1064 mov(result_tmp, cnt1);
1065 lea(str1, Address(str1, cnt1));
1066 sub(cnt1_neg, zr, cnt1);
1067
1068 mov(tmp3, 0x0101010101010101);
1069
1070 BIND(CH1_LOOP);
1071 ldr(ch1, Address(str1, cnt1_neg));
1072 eor(ch1, ch, ch1);
1073 sub(tmp1, ch1, tmp3);
1074 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1075 bics(tmp1, tmp1, tmp2);
1076 br(NE, HAS_ZERO);
1077 adds(cnt1_neg, cnt1_neg, 8);
1078 br(LT, CH1_LOOP);
1079
1080 cmp(cnt1_neg, (u1)8);
1081 mov(cnt1_neg, 0);
1082 br(LT, CH1_LOOP);
1083 b(NOMATCH);
1084
1085 BIND(HAS_ZERO);
1086 rev(tmp1, tmp1);
1087 clz(tmp1, tmp1);
1088 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1089 b(MATCH);
1090
1091 BIND(DO1_SHORT);
1092 mov(result_tmp, cnt1);
1093 lea(str1, Address(str1, cnt1));
1094 sub(cnt1_neg, zr, cnt1);
1095 BIND(DO1_LOOP);
1096 ldrb(ch1, Address(str1, cnt1_neg));
1097 cmp(ch, ch1);
1098 br(EQ, MATCH);
1099 adds(cnt1_neg, cnt1_neg, 1);
1100 br(LT, DO1_LOOP);
1101 BIND(NOMATCH);
1102 mov(result, -1);
1103 b(DONE);
1104 BIND(MATCH);
1105 add(result, result_tmp, cnt1_neg);
1106 BIND(DONE);
1107 }
1108
1109 // Compare strings.
1110 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1111 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1112 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1113 PRegister pgtmp1, PRegister pgtmp2, int ae) {
1114 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1115 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1116 SHORT_LOOP_START, TAIL_CHECK;
1117
1118 bool isLL = ae == StrIntrinsicNode::LL;
1119 bool isLU = ae == StrIntrinsicNode::LU;
1120 bool isUL = ae == StrIntrinsicNode::UL;
1121
1122 // The stub threshold for LL strings is: 72 (64 + 8) chars
1123 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1124 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1125 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1126
1127 bool str1_isL = isLL || isLU;
1128 bool str2_isL = isLL || isUL;
1129
1130 int str1_chr_shift = str1_isL ? 0 : 1;
1131 int str2_chr_shift = str2_isL ? 0 : 1;
1132 int str1_chr_size = str1_isL ? 1 : 2;
1133 int str2_chr_size = str2_isL ? 1 : 2;
1134 int minCharsInWord = isLL ? wordSize : wordSize/2;
1135
1136 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1137 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1138 (chr_insn)&MacroAssembler::ldrh;
1139 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1140 (chr_insn)&MacroAssembler::ldrh;
1141 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1142 (uxt_insn)&MacroAssembler::uxthw;
1143
1144 BLOCK_COMMENT("string_compare {");
1145
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
1148 if (!str1_isL) asrw(cnt1, cnt1, 1);
1149 if (!str2_isL) asrw(cnt2, cnt2, 1);
1150
1151 // Compute the minimum of the string lengths and save the difference.
1152 subsw(result, cnt1, cnt2);
1153 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1154
1155 // A very short string
1156 cmpw(cnt2, minCharsInWord);
1157 br(Assembler::LE, SHORT_STRING);
1158
1159 // Compare longwords
1160 // load first parts of strings and finish initialization while loading
1161 {
1162 if (str1_isL == str2_isL) { // LL or UU
1163 ldr(tmp1, Address(str1));
1164 cmp(str1, str2);
1165 br(Assembler::EQ, DONE);
1166 ldr(tmp2, Address(str2));
1167 cmp(cnt2, stub_threshold);
1168 br(GE, STUB);
1169 subsw(cnt2, cnt2, minCharsInWord);
1170 br(EQ, TAIL_CHECK);
1171 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1172 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1173 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1174 } else if (isLU) {
1175 ldrs(vtmp, Address(str1));
1176 ldr(tmp2, Address(str2));
1177 cmp(cnt2, stub_threshold);
1178 br(GE, STUB);
1179 subw(cnt2, cnt2, 4);
1180 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1181 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1182 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1183 zip1(vtmp, T8B, vtmp, vtmpZ);
1184 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1185 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1186 add(cnt1, cnt1, 4);
1187 fmovd(tmp1, vtmp);
1188 } else { // UL case
1189 ldr(tmp1, Address(str1));
1190 ldrs(vtmp, Address(str2));
1191 cmp(cnt2, stub_threshold);
1192 br(GE, STUB);
1193 subw(cnt2, cnt2, 4);
1194 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1195 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1196 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1197 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1198 zip1(vtmp, T8B, vtmp, vtmpZ);
1199 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1200 add(cnt1, cnt1, 8);
1201 fmovd(tmp2, vtmp);
1202 }
1203 adds(cnt2, cnt2, isUL ? 4 : 8);
1204 br(GE, TAIL);
1205 eor(rscratch2, tmp1, tmp2);
1206 cbnz(rscratch2, DIFF);
1207 // main loop
1208 bind(NEXT_WORD);
1209 if (str1_isL == str2_isL) {
1210 ldr(tmp1, Address(str1, cnt2));
1211 ldr(tmp2, Address(str2, cnt2));
1212 adds(cnt2, cnt2, 8);
1213 } else if (isLU) {
1214 ldrs(vtmp, Address(str1, cnt1));
1215 ldr(tmp2, Address(str2, cnt2));
1216 add(cnt1, cnt1, 4);
1217 zip1(vtmp, T8B, vtmp, vtmpZ);
1218 fmovd(tmp1, vtmp);
1219 adds(cnt2, cnt2, 8);
1220 } else { // UL
1221 ldrs(vtmp, Address(str2, cnt2));
1222 ldr(tmp1, Address(str1, cnt1));
1223 zip1(vtmp, T8B, vtmp, vtmpZ);
1224 add(cnt1, cnt1, 8);
1225 fmovd(tmp2, vtmp);
1226 adds(cnt2, cnt2, 4);
1227 }
1228 br(GE, TAIL);
1229
1230 eor(rscratch2, tmp1, tmp2);
1231 cbz(rscratch2, NEXT_WORD);
1232 b(DIFF);
1233 bind(TAIL);
1234 eor(rscratch2, tmp1, tmp2);
1235 cbnz(rscratch2, DIFF);
1236 // Last longword. In the case where length == 4 we compare the
1237 // same longword twice, but that's still faster than another
1238 // conditional branch.
1239 if (str1_isL == str2_isL) {
1240 ldr(tmp1, Address(str1));
1241 ldr(tmp2, Address(str2));
1242 } else if (isLU) {
1243 ldrs(vtmp, Address(str1));
1244 ldr(tmp2, Address(str2));
1245 zip1(vtmp, T8B, vtmp, vtmpZ);
1246 fmovd(tmp1, vtmp);
1247 } else { // UL
1248 ldrs(vtmp, Address(str2));
1249 ldr(tmp1, Address(str1));
1250 zip1(vtmp, T8B, vtmp, vtmpZ);
1251 fmovd(tmp2, vtmp);
1252 }
1253 bind(TAIL_CHECK);
1254 eor(rscratch2, tmp1, tmp2);
1255 cbz(rscratch2, DONE);
1256
1257 // Find the first different characters in the longwords and
1258 // compute their difference.
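    // Sketch of the extraction below: the longwords differ, so rscratch2 = tmp1 ^ tmp2 is
    // non-zero. The characters sit little-endian in the registers, so rev + clz, rounded down
    // to a character boundary (8 or 16 bits) by andr, gives the bit offset of the first
    // differing character; lsrv then shifts that character down to bit 0 in both operands
    // before both are zero-extended and subtracted.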
1259 bind(DIFF);
1260 rev(rscratch2, rscratch2);
1261 clz(rscratch2, rscratch2);
1262 andr(rscratch2, rscratch2, isLL ? -8 : -16);
1263 lsrv(tmp1, tmp1, rscratch2);
1264 (this->*ext_chr)(tmp1, tmp1);
1265 lsrv(tmp2, tmp2, rscratch2);
1266 (this->*ext_chr)(tmp2, tmp2);
1267 subw(result, tmp1, tmp2);
1268 b(DONE);
1269 }
1270
1271 bind(STUB);
1272 RuntimeAddress stub = nullptr;
1273 switch(ae) {
1274 case StrIntrinsicNode::LL:
1275 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1276 break;
1277 case StrIntrinsicNode::UU:
1278 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1279 break;
1280 case StrIntrinsicNode::LU:
1281 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1282 break;
1283 case StrIntrinsicNode::UL:
1284 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1285 break;
1286 default:
1287 ShouldNotReachHere();
1288 }
1289 assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1290 address call = trampoline_call(stub);
1291 if (call == nullptr) {
1292 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1293 ciEnv::current()->record_failure("CodeCache is full");
1294 return;
1295 }
1296 b(DONE);
1297
1298 bind(SHORT_STRING);
1299 // Is the minimum length zero?
1300 cbz(cnt2, DONE);
  // Arrange the code so that most branches are done while loading, and the next
  // characters are loaded while the previous ones are being compared.
1303 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1304 subs(cnt2, cnt2, 1);
1305 br(EQ, SHORT_LAST_INIT);
1306 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1307 b(SHORT_LOOP_START);
1308 bind(SHORT_LOOP);
1309 subs(cnt2, cnt2, 1);
1310 br(EQ, SHORT_LAST);
1311 bind(SHORT_LOOP_START);
1312 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1313 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1314 cmp(tmp1, cnt1);
1315 br(NE, SHORT_LOOP_TAIL);
1316 subs(cnt2, cnt2, 1);
1317 br(EQ, SHORT_LAST2);
1318 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1319 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1320 cmp(tmp2, rscratch1);
1321 br(EQ, SHORT_LOOP);
1322 sub(result, tmp2, rscratch1);
1323 b(DONE);
1324 bind(SHORT_LOOP_TAIL);
1325 sub(result, tmp1, cnt1);
1326 b(DONE);
1327 bind(SHORT_LAST2);
1328 cmp(tmp2, rscratch1);
1329 br(EQ, DONE);
1330 sub(result, tmp2, rscratch1);
1331
1332 b(DONE);
1333 bind(SHORT_LAST_INIT);
1334 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1335 bind(SHORT_LAST);
1336 cmp(tmp1, cnt1);
1337 br(EQ, DONE);
1338 sub(result, tmp1, cnt1);
1339
1340 bind(DONE);
1341
1342 BLOCK_COMMENT("} string_compare");
1343 }
1344
1345 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1346 FloatRegister src2, Condition cond, bool isQ) {
1347 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1348 FloatRegister zn = src1, zm = src2;
1349 bool needs_negation = false;
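  // NEON only provides the "greater" family of register-register compares (CMEQ/CMGT/CMGE/
  // CMHI/CMHS and their FP counterparts), so the "less" conditions below are handled by
  // swapping the operands, and NE is handled as EQ followed by a bitwise NOT.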
1350 switch (cond) {
1351 case LT: cond = GT; zn = src2; zm = src1; break;
1352 case LE: cond = GE; zn = src2; zm = src1; break;
1353 case LO: cond = HI; zn = src2; zm = src1; break;
1354 case LS: cond = HS; zn = src2; zm = src1; break;
1355 case NE: cond = EQ; needs_negation = true; break;
1356 default:
1357 break;
1358 }
1359
1360 if (is_floating_point_type(bt)) {
1361 fcm(cond, dst, size, zn, zm);
1362 } else {
1363 cm(cond, dst, size, zn, zm);
1364 }
1365
1366 if (needs_negation) {
1367 notr(dst, isQ ? T16B : T8B, dst);
1368 }
1369 }
1370
1371 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1372 Condition cond, bool isQ) {
1373 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1374 if (bt == T_FLOAT || bt == T_DOUBLE) {
1375 if (cond == Assembler::NE) {
1376 fcm(Assembler::EQ, dst, size, src);
1377 notr(dst, isQ ? T16B : T8B, dst);
1378 } else {
1379 fcm(cond, dst, size, src);
1380 }
1381 } else {
1382 if (cond == Assembler::NE) {
1383 cm(Assembler::EQ, dst, size, src);
1384 notr(dst, isQ ? T16B : T8B, dst);
1385 } else {
1386 cm(cond, dst, size, src);
1387 }
1388 }
1389 }
1390
1391 // Compress the least significant bit of each byte to the rightmost and clear
1392 // the higher garbage bits.
1393 void C2_MacroAssembler::bytemask_compress(Register dst) {
1394 // Example input, dst = 0x01 00 00 00 01 01 00 01
1395 // The "??" bytes are garbage.
1396 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1397 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1398 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1399 andr(dst, dst, 0xff); // dst = 0x8D
1400 }
1401
1402 // Pack the lowest-numbered bit of each mask element in src into a long value
1403 // in dst, at most the first 64 lane elements.
1404 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1405 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1406 FloatRegister vtmp1, FloatRegister vtmp2) {
1407 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1408 assert_different_registers(dst, rscratch1);
1409 assert_different_registers(vtmp1, vtmp2);
1410
1411 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1412 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1413 // Expected: dst = 0x658D
1414
1415 // Convert the mask into vector with sequential bytes.
1416 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1417 sve_cpy(vtmp1, size, src, 1, false);
1418 if (bt != T_BYTE) {
1419 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1420 }
1421
1422 if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1423 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1424 // is to compress each significant bit of the byte in a cross-lane way. Due
1425 // to the lack of a cross-lane bit-compress instruction, we use BEXT
1426 // (bit-compress in each lane) with the biggest lane size (T = D) then
1427 // concatenate the results.
1428
1429 // The second source input of BEXT, initialized with 0x01 in each byte.
1430 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1431 sve_dup(vtmp2, B, 1);
1432
1433 // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1434 // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1435 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1436 // ---------------------------------------
1437 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1438 sve_bext(vtmp1, D, vtmp1, vtmp2);
1439
    // Concatenate the least significant 8 bits of each 8-byte lane, and extract the
    // result to dst.
1442 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1443 // dst = 0x658D
1444 if (lane_cnt <= 8) {
1445 // No need to concatenate.
1446 umov(dst, vtmp1, B, 0);
1447 } else if (lane_cnt <= 16) {
1448 ins(vtmp1, B, vtmp1, 1, 8);
1449 umov(dst, vtmp1, H, 0);
1450 } else {
1451 // As the lane count is 64 at most, the final expected value must be in
1452 // the lowest 64 bits after narrowing vtmp1 from D to B.
1453 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1454 umov(dst, vtmp1, D, 0);
1455 }
1456 } else if (UseSVE > 0) {
1457 // Compress the lowest 8 bytes.
1458 fmovd(dst, vtmp1);
1459 bytemask_compress(dst);
1460 if (lane_cnt <= 8) return;
1461
1462 // Repeat on higher bytes and join the results.
1463 // Compress 8 bytes in each iteration.
1464 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1465 sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1466 bytemask_compress(rscratch1);
1467 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1468 }
1469 } else {
1470 assert(false, "unsupported");
1471 ShouldNotReachHere();
1472 }
1473 }
1474
1475 // Unpack the mask, a long value in src, into predicate register dst based on the
1476 // corresponding data type. Note that dst can support at most 64 lanes.
1477 // Below example gives the expected dst predicate register in different types, with
1478 // a valid src(0x658D) on a 1024-bit vector size machine.
1479 // BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1480 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1481 // INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1482 // LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1483 //
1484 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1485 // has 24 significant bits would be an invalid input if dst predicate register refers to
1486 // a LONG type 1024-bit vector, which has at most 16 lanes.
1487 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1488 FloatRegister vtmp1, FloatRegister vtmp2) {
1489 assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1490 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1491 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1492 // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1493 // Expected: dst = 0b01101001 10001101
1494
1495 // Put long value from general purpose register into the first lane of vector.
1496 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1497 sve_dup(vtmp1, B, 0);
1498 mov(vtmp1, D, 0, src);
1499
  // As sve_cmp generates the mask value with byte granularity at minimum, we need to
  // transform the value in the first lane, which is currently a bit mask, into a
  // byte mask. This can be done with SVE2's BDEP instruction.
1503
  // The first source input of the BDEP instruction. Deposit each mask byte into its own 8-byte lane.
1505 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1506 if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1508 } else if (lane_cnt <= 16) {
1509 ins(vtmp1, B, vtmp1, 8, 1);
1510 mov(vtmp1, B, 1, zr);
1511 } else {
1512 sve_vector_extend(vtmp1, D, vtmp1, B);
1513 }
1514
1515 // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1516 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1517 sve_dup(vtmp2, B, 1);
1518
1519 // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1520 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1521 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1522 // ---------------------------------------
1523 // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1524 sve_bdep(vtmp1, D, vtmp1, vtmp2);
1525
1526 if (bt != T_BYTE) {
1527 sve_vector_extend(vtmp1, size, vtmp1, B);
1528 }
  // Generate the mask from the given vector, in which the elements have been
  // extended to the expected type.
1531 // dst = 0b01101001 10001101
1532 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1533 }
1534
1535 // Clobbers: rflags
1536 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1537 FloatRegister zn, FloatRegister zm, Condition cond) {
1538 assert(pg->is_governing(), "This register has to be a governing predicate register");
1539 FloatRegister z1 = zn, z2 = zm;
1540 switch (cond) {
1541 case LE: z1 = zm; z2 = zn; cond = GE; break;
1542 case LT: z1 = zm; z2 = zn; cond = GT; break;
1543 case LO: z1 = zm; z2 = zn; cond = HI; break;
1544 case LS: z1 = zm; z2 = zn; cond = HS; break;
1545 default:
1546 break;
1547 }
1548
1549 SIMD_RegVariant size = elemType_to_regVariant(bt);
1550 if (is_floating_point_type(bt)) {
1551 sve_fcm(cond, pd, size, pg, z1, z2);
1552 } else {
1553 assert(is_integral_type(bt), "unsupported element type");
1554 sve_cmp(cond, pd, size, pg, z1, z2);
1555 }
1556 }
1557
1558 // Get index of the last mask lane that is set
1559 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
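  // A sketch of the approach used below:
  //   1. Reverse the lane order of the mask (sve_rev), so the last set lane becomes the first.
  //   2. Break before the first true lane (sve_brkb), leaving set only the lanes preceding it.
  //   3. Count those lanes (sve_cntp); this is the distance of the last set lane from the end.
  //   4. last_true_index = (MaxVectorSize / type2aelembytes(bt) - 1) - that distance.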
1560 SIMD_RegVariant size = elemType_to_regVariant(bt);
1561 sve_rev(ptmp, size, src);
1562 sve_brkb(ptmp, ptrue, ptmp, false);
1563 sve_cntp(dst, size, ptrue, ptmp);
1564 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1565 subw(dst, rscratch1, dst);
1566 }
1567
1568 // Extend integer vector src to dst with the same lane count
1569 // but larger element size, e.g. 4B -> 4I
1570 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1571 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1572 if (src_bt == T_BYTE) {
1573 // 4B to 4S/4I, 8B to 8S
1574 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1575 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1576 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1577 if (dst_bt == T_INT) {
1578 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1579 }
1580 } else if (src_bt == T_SHORT) {
1581 // 2S to 2I/2L, 4S to 4I
1582 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1583 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1584 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1585 if (dst_bt == T_LONG) {
1586 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1587 }
1588 } else if (src_bt == T_INT) {
1589 // 2I to 2L
1590 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1591 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1592 } else {
1593 ShouldNotReachHere();
1594 }
1595 }
1596
1597 // Narrow integer vector src down to dst with the same lane count
1598 // but smaller element size, e.g. 4I -> 4B
1599 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1600 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1601 if (src_bt == T_SHORT) {
1602 // 4S/8S to 4B/8B
1603 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1604 assert(dst_bt == T_BYTE, "unsupported");
1605 xtn(dst, T8B, src, T8H);
1606 } else if (src_bt == T_INT) {
1607 // 2I to 2S, 4I to 4B/4S
1608 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1609 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1610 xtn(dst, T4H, src, T4S);
1611 if (dst_bt == T_BYTE) {
1612 xtn(dst, T8B, dst, T8H);
1613 }
1614 } else if (src_bt == T_LONG) {
1615 // 2L to 2S/2I
1616 assert(src_vlen_in_bytes == 16, "unsupported");
1617 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1618 xtn(dst, T2S, src, T2D);
1619 if (dst_bt == T_SHORT) {
1620 xtn(dst, T4H, dst, T4S);
1621 }
1622 } else {
1623 ShouldNotReachHere();
1624 }
1625 }
1626
1627 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1628 FloatRegister src, SIMD_RegVariant src_size,
1629 bool is_unsigned) {
1630 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1631
1632 if (src_size == B) {
1633 switch (dst_size) {
1634 case H:
1635 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1636 break;
1637 case S:
1638 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1639 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1640 break;
1641 case D:
1642 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1643 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1644 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1645 break;
1646 default:
1647 ShouldNotReachHere();
1648 }
1649 } else if (src_size == H) {
1650 if (dst_size == S) {
1651 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1652 } else { // D
1653 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1654 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1655 }
1656 } else if (src_size == S) {
1657 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1658 }
1659 }
1660
1661 // Vector narrow from src to dst with specified element sizes.
1662 // High part of dst vector will be filled with zero.
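// The narrowing is done with "uzp1" against an all-zero tmp vector, which keeps the low
// halves of the source elements and fills the upper result lanes with zero. For example,
// narrowing 2 longs to 2 ints on a 128-bit vector:
//   src = 0000000000000002 0000000000000001, tmp = all zero
//   sve_uzp1(dst, S, src, tmp) => dst = 00000000 00000000 00000002 00000001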
1663 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1664 FloatRegister src, SIMD_RegVariant src_size,
1665 FloatRegister tmp) {
1666 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1667 assert_different_registers(src, tmp);
1668 sve_dup(tmp, src_size, 0);
1669 if (src_size == D) {
1670 switch (dst_size) {
1671 case S:
1672 sve_uzp1(dst, S, src, tmp);
1673 break;
1674 case H:
1675 assert_different_registers(dst, tmp);
1676 sve_uzp1(dst, S, src, tmp);
1677 sve_uzp1(dst, H, dst, tmp);
1678 break;
1679 case B:
1680 assert_different_registers(dst, tmp);
1681 sve_uzp1(dst, S, src, tmp);
1682 sve_uzp1(dst, H, dst, tmp);
1683 sve_uzp1(dst, B, dst, tmp);
1684 break;
1685 default:
1686 ShouldNotReachHere();
1687 }
1688 } else if (src_size == S) {
1689 if (dst_size == H) {
1690 sve_uzp1(dst, H, src, tmp);
1691 } else { // B
1692 assert_different_registers(dst, tmp);
1693 sve_uzp1(dst, H, src, tmp);
1694 sve_uzp1(dst, B, dst, tmp);
1695 }
1696 } else if (src_size == H) {
1697 sve_uzp1(dst, B, src, tmp);
1698 }
1699 }
1700
1701 // Extend src predicate to dst predicate with the same lane count but larger
1702 // element size, e.g. a mask for a 64-bit vector of bytes -> a mask for a 512-bit vector of longs (8 lanes each)
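// Each "punpklo" unpacks the low half of the source predicate and doubles the per-lane
// predicate element size, so one/two/three unpacks cover the 2x/4x/8x extensions below.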
1703 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1704 uint dst_element_length_in_bytes,
1705 uint src_element_length_in_bytes) {
1706 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1707 sve_punpklo(dst, src);
1708 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1709 sve_punpklo(dst, src);
1710 sve_punpklo(dst, dst);
1711 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1712 sve_punpklo(dst, src);
1713 sve_punpklo(dst, dst);
1714 sve_punpklo(dst, dst);
1715 } else {
1716 assert(false, "unsupported");
1717 ShouldNotReachHere();
1718 }
1719 }
1720
1721 // Narrow src predicate to dst predicate with the same lane count but
1722 // smaller element size, e.g. a mask for a 512-bit vector of longs -> a mask for a 64-bit vector of bytes (8 lanes each)
1723 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1724 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1725 // The insignificant bits in src predicate are expected to be zero.
1726 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1727 // passed as the second argument. An example narrowing operation with a given mask would be
1728 // a 128-bit vector of longs -> a 64-bit vector of ints on a 128-bit machine, i.e. 2L -> 2I
1729 // Mask (for 2 Longs) : TF
1730 // Predicate register for the above mask (16 bits) : 00000001 00000000
1731 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1732 // This translates to a mask for 2 ints: TF (the lower half is significant while the upper half is 0)
1733 assert_different_registers(src, ptmp);
1734 assert_different_registers(dst, ptmp);
1735 sve_pfalse(ptmp);
1736 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1737 sve_uzp1(dst, B, src, ptmp);
1738 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1739 sve_uzp1(dst, H, src, ptmp);
1740 sve_uzp1(dst, B, dst, ptmp);
1741 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1742 sve_uzp1(dst, S, src, ptmp);
1743 sve_uzp1(dst, H, dst, ptmp);
1744 sve_uzp1(dst, B, dst, ptmp);
1745 } else {
1746 assert(false, "unsupported");
1747 ShouldNotReachHere();
1748 }
1749 }
1750
1751 // Vector reduction add for integral type with ASIMD instructions.
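// dst = isrc + vsrc[0] + vsrc[1] + ... + vsrc[n-1]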
1752 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1753 Register isrc, FloatRegister vsrc,
1754 unsigned vector_length_in_bytes,
1755 FloatRegister vtmp) {
1756 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1757 assert_different_registers(dst, isrc);
1758 bool isQ = vector_length_in_bytes == 16;
1759
1760 BLOCK_COMMENT("neon_reduce_add_integral {");
1761 switch(bt) {
1762 case T_BYTE:
1763 addv(vtmp, isQ ? T16B : T8B, vsrc);
1764 smov(dst, vtmp, B, 0);
1765 addw(dst, dst, isrc, ext::sxtb);
1766 break;
1767 case T_SHORT:
1768 addv(vtmp, isQ ? T8H : T4H, vsrc);
1769 smov(dst, vtmp, H, 0);
1770 addw(dst, dst, isrc, ext::sxth);
1771 break;
1772 case T_INT:
1773 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1774 umov(dst, vtmp, S, 0);
1775 addw(dst, dst, isrc);
1776 break;
1777 case T_LONG:
1778 assert(isQ, "unsupported");
1779 addpd(vtmp, vsrc);
1780 umov(dst, vtmp, D, 0);
1781 add(dst, dst, isrc);
1782 break;
1783 default:
1784 assert(false, "unsupported");
1785 ShouldNotReachHere();
1786 }
1787 BLOCK_COMMENT("} neon_reduce_add_integral");
1788 }
1789
1790 // Vector reduction multiply for integral type with ASIMD instructions.
1791 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1792 // Clobbers: rscratch1
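// dst = isrc * vsrc[0] * vsrc[1] * ... * vsrc[n-1], with intermediate results truncated to the element type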
1793 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1794 Register isrc, FloatRegister vsrc,
1795 unsigned vector_length_in_bytes,
1796 FloatRegister vtmp1, FloatRegister vtmp2) {
1797 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1798 bool isQ = vector_length_in_bytes == 16;
1799
1800 BLOCK_COMMENT("neon_reduce_mul_integral {");
1801 switch(bt) {
1802 case T_BYTE:
1803 if (isQ) {
1804 // Multiply the lower half and the upper half of the vector iteratively.
1805 // vtmp1 = vsrc[8:15]
1806 ins(vtmp1, D, vsrc, 0, 1);
1807 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1808 mulv(vtmp1, T8B, vtmp1, vsrc);
1809 // vtmp2 = vtmp1[4:7]
1810 ins(vtmp2, S, vtmp1, 0, 1);
1811 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1812 mulv(vtmp1, T8B, vtmp2, vtmp1);
1813 } else {
1814 ins(vtmp1, S, vsrc, 0, 1);
1815 mulv(vtmp1, T8B, vtmp1, vsrc);
1816 }
1817 // vtmp2 = vtmp1[2:3]
1818 ins(vtmp2, H, vtmp1, 0, 1);
1819 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1820 mulv(vtmp2, T8B, vtmp2, vtmp1);
1821 // dst = vtmp2[0] * isrc * vtmp2[1]
1822 umov(rscratch1, vtmp2, B, 0);
1823 mulw(dst, rscratch1, isrc);
1824 sxtb(dst, dst);
1825 umov(rscratch1, vtmp2, B, 1);
1826 mulw(dst, rscratch1, dst);
1827 sxtb(dst, dst);
1828 break;
1829 case T_SHORT:
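// Same halving strategy as the T_BYTE case, but on 4H lanes: multiply the two vector
// halves together iteratively, then combine the final two lanes with isrc.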
1830 if (isQ) {
1831 ins(vtmp2, D, vsrc, 0, 1);
1832 mulv(vtmp2, T4H, vtmp2, vsrc);
1833 ins(vtmp1, S, vtmp2, 0, 1);
1834 mulv(vtmp1, T4H, vtmp1, vtmp2);
1835 } else {
1836 ins(vtmp1, S, vsrc, 0, 1);
1837 mulv(vtmp1, T4H, vtmp1, vsrc);
1838 }
1839 umov(rscratch1, vtmp1, H, 0);
1840 mulw(dst, rscratch1, isrc);
1841 sxth(dst, dst);
1842 umov(rscratch1, vtmp1, H, 1);
1843 mulw(dst, rscratch1, dst);
1844 sxth(dst, dst);
1845 break;
1846 case T_INT:
1847 if (isQ) {
1848 ins(vtmp1, D, vsrc, 0, 1);
1849 mulv(vtmp1, T2S, vtmp1, vsrc);
1850 } else {
1851 vtmp1 = vsrc;
1852 }
1853 umov(rscratch1, vtmp1, S, 0);
1854 mul(dst, rscratch1, isrc);
1855 umov(rscratch1, vtmp1, S, 1);
1856 mul(dst, rscratch1, dst);
1857 break;
1858 case T_LONG:
1859 umov(rscratch1, vsrc, D, 0);
1860 mul(dst, isrc, rscratch1);
1861 umov(rscratch1, vsrc, D, 1);
1862 mul(dst, dst, rscratch1);
1863 break;
1864 default:
1865 assert(false, "unsupported");
1866 ShouldNotReachHere();
1867 }
1868 BLOCK_COMMENT("} neon_reduce_mul_integral");
1869 }
1870
1871 // Vector reduction multiply for floating-point type with ASIMD instructions.
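// dst = fsrc * vsrc[0] * vsrc[1] * ... * vsrc[n-1]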
1872 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1873 FloatRegister fsrc, FloatRegister vsrc,
1874 unsigned vector_length_in_bytes,
1875 FloatRegister vtmp) {
1876 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1877 bool isQ = vector_length_in_bytes == 16;
1878
1879 BLOCK_COMMENT("neon_reduce_mul_fp {");
1880 switch(bt) {
1881 case T_FLOAT:
1882 fmuls(dst, fsrc, vsrc);
1883 ins(vtmp, S, vsrc, 0, 1);
1884 fmuls(dst, dst, vtmp);
1885 if (isQ) {
1886 ins(vtmp, S, vsrc, 0, 2);
1887 fmuls(dst, dst, vtmp);
1888 ins(vtmp, S, vsrc, 0, 3);
1889 fmuls(dst, dst, vtmp);
1890 }
1891 break;
1892 case T_DOUBLE:
1893 assert(isQ, "unsupported");
1894 fmuld(dst, fsrc, vsrc);
1895 ins(vtmp, D, vsrc, 0, 1);
1896 fmuld(dst, dst, vtmp);
1897 break;
1898 default:
1899 assert(false, "unsupported");
1900 ShouldNotReachHere();
1901 }
1902 BLOCK_COMMENT("} neon_reduce_mul_fp");
1903 }
1904
1905 // Helper to select logical instruction
1906 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1907 Register Rn, Register Rm,
1908 enum shift_kind kind, unsigned shift) {
1909 switch(opc) {
1910 case Op_AndReductionV:
1911 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1912 break;
1913 case Op_OrReductionV:
1914 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1915 break;
1916 case Op_XorReductionV:
1917 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1918 break;
1919 default:
1920 assert(false, "unsupported");
1921 ShouldNotReachHere();
1922 }
1923 }
1924
1925 // Vector reduction logical operations And, Or, Xor
1926 // Clobbers: rscratch1
1927 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1928 Register isrc, FloatRegister vsrc,
1929 unsigned vector_length_in_bytes) {
1930 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1931 "unsupported");
1932 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1933 assert_different_registers(dst, isrc);
1934 bool isQ = vector_length_in_bytes == 16;
1935
1936 BLOCK_COMMENT("neon_reduce_logical {");
1937 umov(rscratch1, vsrc, isQ ? D : S, 0);
1938 umov(dst, vsrc, isQ ? D : S, 1);
1939 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1940 switch(bt) {
1941 case T_BYTE:
1942 if (isQ) {
1943 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1944 }
1945 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1946 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1947 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1948 sxtb(dst, dst);
1949 break;
1950 case T_SHORT:
1951 if (isQ) {
1952 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1953 }
1954 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1955 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1956 sxth(dst, dst);
1957 break;
1958 case T_INT:
1959 if (isQ) {
1960 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1961 }
1962 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1963 break;
1964 case T_LONG:
1965 assert(isQ, "unsupported");
1966 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1967 break;
1968 default:
1969 assert(false, "unsupported");
1970 ShouldNotReachHere();
1971 }
1972 BLOCK_COMMENT("} neon_reduce_logical");
1973 }
1974
1975 // Vector reduction min/max for integral type with ASIMD instructions.
1976 // Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1977 // Clobbers: rscratch1, rflags
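// dst = min/max(isrc, vsrc[0], vsrc[1], ..., vsrc[n-1])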
1978 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1979 Register isrc, FloatRegister vsrc,
1980 unsigned vector_length_in_bytes,
1981 FloatRegister vtmp) {
1982 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1983 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1984 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1985 assert_different_registers(dst, isrc);
1986 bool isQ = vector_length_in_bytes == 16;
1987 bool is_min = opc == Op_MinReductionV;
1988
1989 BLOCK_COMMENT("neon_reduce_minmax_integral {");
1990 if (bt == T_LONG) {
1991 assert(vtmp == fnoreg, "should be");
1992 assert(isQ, "should be");
1993 umov(rscratch1, vsrc, D, 0);
1994 cmp(isrc, rscratch1);
1995 csel(dst, isrc, rscratch1, is_min ? LT : GT);
1996 umov(rscratch1, vsrc, D, 1);
1997 cmp(dst, rscratch1);
1998 csel(dst, dst, rscratch1, is_min ? LT : GT);
1999 } else {
2000 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2001 if (size == T2S) {
2002 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2003 } else {
2004 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2005 }
2006 if (bt == T_INT) {
2007 umov(dst, vtmp, S, 0);
2008 } else {
2009 smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2010 }
2011 cmpw(dst, isrc);
2012 cselw(dst, dst, isrc, is_min ? LT : GT);
2013 }
2014 BLOCK_COMMENT("} neon_reduce_minmax_integral");
2015 }
2016
2017 // Vector reduction for integral type with SVE instruction.
2018 // Supported operations are Add, And, Or, Xor, Max, Min.
2019 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
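// dst = src1 <op> src2[0] <op> src2[1] <op> ..., where only the lanes of src2 that are
// active in the governing predicate pg take part in the reduction.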
2020 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2021 FloatRegister src2, PRegister pg, FloatRegister tmp) {
2022 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2023 assert(pg->is_governing(), "This register has to be a governing predicate register");
2024 assert_different_registers(src1, dst);
2025 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2026 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2027 switch (opc) {
2028 case Op_AddReductionVI: {
2029 sve_uaddv(tmp, size, pg, src2);
2030 if (bt == T_BYTE) {
2031 smov(dst, tmp, size, 0);
2032 addw(dst, src1, dst, ext::sxtb);
2033 } else if (bt == T_SHORT) {
2034 smov(dst, tmp, size, 0);
2035 addw(dst, src1, dst, ext::sxth);
2036 } else {
2037 umov(dst, tmp, size, 0);
2038 addw(dst, dst, src1);
2039 }
2040 break;
2041 }
2042 case Op_AddReductionVL: {
2043 sve_uaddv(tmp, size, pg, src2);
2044 umov(dst, tmp, size, 0);
2045 add(dst, dst, src1);
2046 break;
2047 }
2048 case Op_AndReductionV: {
2049 sve_andv(tmp, size, pg, src2);
2050 if (bt == T_INT || bt == T_LONG) {
2051 umov(dst, tmp, size, 0);
2052 } else {
2053 smov(dst, tmp, size, 0);
2054 }
2055 if (bt == T_LONG) {
2056 andr(dst, dst, src1);
2057 } else {
2058 andw(dst, dst, src1);
2059 }
2060 break;
2061 }
2062 case Op_OrReductionV: {
2063 sve_orv(tmp, size, pg, src2);
2064 if (bt == T_INT || bt == T_LONG) {
2065 umov(dst, tmp, size, 0);
2066 } else {
2067 smov(dst, tmp, size, 0);
2068 }
2069 if (bt == T_LONG) {
2070 orr(dst, dst, src1);
2071 } else {
2072 orrw(dst, dst, src1);
2073 }
2074 break;
2075 }
2076 case Op_XorReductionV: {
2077 sve_eorv(tmp, size, pg, src2);
2078 if (bt == T_INT || bt == T_LONG) {
2079 umov(dst, tmp, size, 0);
2080 } else {
2081 smov(dst, tmp, size, 0);
2082 }
2083 if (bt == T_LONG) {
2084 eor(dst, dst, src1);
2085 } else {
2086 eorw(dst, dst, src1);
2087 }
2088 break;
2089 }
2090 case Op_MaxReductionV: {
2091 sve_smaxv(tmp, size, pg, src2);
2092 if (bt == T_INT || bt == T_LONG) {
2093 umov(dst, tmp, size, 0);
2094 } else {
2095 smov(dst, tmp, size, 0);
2096 }
2097 if (bt == T_LONG) {
2098 cmp(dst, src1);
2099 csel(dst, dst, src1, Assembler::GT);
2100 } else {
2101 cmpw(dst, src1);
2102 cselw(dst, dst, src1, Assembler::GT);
2103 }
2104 break;
2105 }
2106 case Op_MinReductionV: {
2107 sve_sminv(tmp, size, pg, src2);
2108 if (bt == T_INT || bt == T_LONG) {
2109 umov(dst, tmp, size, 0);
2110 } else {
2111 smov(dst, tmp, size, 0);
2112 }
2113 if (bt == T_LONG) {
2114 cmp(dst, src1);
2115 csel(dst, dst, src1, Assembler::LT);
2116 } else {
2117 cmpw(dst, src1);
2118 cselw(dst, dst, src1, Assembler::LT);
2119 }
2120 break;
2121 }
2122 default:
2123 assert(false, "unsupported");
2124 ShouldNotReachHere();
2125 }
2126
2127 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2128 if (bt == T_BYTE) {
2129 sxtb(dst, dst);
2130 } else if (bt == T_SHORT) {
2131 sxth(dst, dst);
2132 }
2133 }
2134 }
2135
2136 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2137 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2138 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
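// For example, with bt == T_INT and lane_cnt == 5 on a 256-bit SVE machine, the first
// 5 S lanes of dst are set to true and the remaining 3 lanes to false, encoded as
// "ptrue dst.s, vl5".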
2139 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2140 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2141 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2142
2143 // Set all elements to false if the input "lane_cnt" is zero.
2144 if (lane_cnt == 0) {
2145 sve_pfalse(dst);
2146 return;
2147 }
2148
2149 SIMD_RegVariant size = elemType_to_regVariant(bt);
2150 assert(size != Q, "invalid size");
2151
2152 // Set all elements to true if "lane_cnt" equals the max lane count.
2153 if (lane_cnt == max_vector_length) {
2154 sve_ptrue(dst, size, /* ALL */ 0b11111);
2155 return;
2156 }
2157
2158 // Fixed numbers for "ptrue".
2159 switch(lane_cnt) {
2160 case 1: /* VL1 */
2161 case 2: /* VL2 */
2162 case 3: /* VL3 */
2163 case 4: /* VL4 */
2164 case 5: /* VL5 */
2165 case 6: /* VL6 */
2166 case 7: /* VL7 */
2167 case 8: /* VL8 */
2168 sve_ptrue(dst, size, lane_cnt);
2169 return;
2170 case 16:
2171 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2172 return;
2173 case 32:
2174 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2175 return;
2176 case 64:
2177 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2178 return;
2179 case 128:
2180 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2181 return;
2182 case 256:
2183 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2184 return;
2185 default:
2186 break;
2187 }
2188
2189 // Special patterns for "ptrue".
2190 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2191 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2192 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2193 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2194 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2195 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2196 } else {
2197 // Encode to "whileltw" for the remaining cases.
2198 mov(rscratch1, lane_cnt);
2199 sve_whileltw(dst, size, zr, rscratch1);
2200 }
2201 }
2202
2203 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2204 // Any remaining elements of dst will be filled with zero.
2205 // Clobbers: rscratch1
2206 // Preserves: src, mask
2207 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2208 FloatRegister vtmp1, FloatRegister vtmp2,
2209 PRegister pgtmp) {
2210 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2211 assert_different_registers(dst, src, vtmp1, vtmp2);
2212 assert_different_registers(mask, pgtmp);
2213
2214 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
2215 // mask = 0001 0000 0000 0001 0001 0000 0001 0001
2216 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
2217 sve_dup(vtmp2, H, 0);
2218
2219 // Extend lowest half to type INT.
2220 // dst = 00004444 00003333 00002222 00001111
2221 sve_uunpklo(dst, S, src);
2222 // pgtmp = 00000001 00000000 00000001 00000001
2223 sve_punpklo(pgtmp, mask);
2224 // Pack the active elements (now INT-sized) to the right,
2225 // and fill the remaining elements with zero.
2226 // dst = 00000000 00004444 00002222 00001111
2227 sve_compact(dst, S, dst, pgtmp);
2228 // Narrow the result back to type SHORT.
2229 // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2230 sve_uzp1(dst, H, dst, vtmp2);
2231 // Count the active elements of lowest half.
2232 // rscratch1 = 3
2233 sve_cntp(rscratch1, S, ptrue, pgtmp);
2234
2235 // Repeat to the highest half.
2236 // pgtmp = 00000001 00000000 00000000 00000001
2237 sve_punpkhi(pgtmp, mask);
2238 // vtmp1 = 00008888 00007777 00006666 00005555
2239 sve_uunpkhi(vtmp1, S, src);
2240 // vtmp1 = 00000000 00000000 00008888 00005555
2241 sve_compact(vtmp1, S, vtmp1, pgtmp);
2242 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2243 sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2244
2245 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
2246 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2247 // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2248 // TRUE_CNT is the number of active elements in the compressed low part.
2249 neg(rscratch1, rscratch1);
2250 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2251 sve_index(vtmp2, H, rscratch1, 1);
2252 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2253 sve_tbl(vtmp1, H, vtmp1, vtmp2);
2254
2255 // Combine the compressed high part (after shifting) with the compressed low part.
2256 // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2257 sve_orr(dst, dst, vtmp1);
2258 }
2259
2260 // Clobbers: rscratch1, rscratch2
2261 // Preserves: src, mask
2262 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2263 FloatRegister vtmp1, FloatRegister vtmp2,
2264 FloatRegister vtmp3, FloatRegister vtmp4,
2265 PRegister ptmp, PRegister pgtmp) {
2266 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2267 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2268 assert_different_registers(mask, ptmp, pgtmp);
2269 // Example input: src = 88 77 66 55 44 33 22 11
2270 // mask = 01 00 00 01 01 00 01 01
2271 // Expected result: dst = 00 00 00 88 55 44 22 11
2272
2273 sve_dup(vtmp4, B, 0);
2274 // Extend lowest half to type SHORT.
2275 // vtmp1 = 0044 0033 0022 0011
2276 sve_uunpklo(vtmp1, H, src);
2277 // ptmp = 0001 0000 0001 0001
2278 sve_punpklo(ptmp, mask);
2279 // Count the active elements of lowest half.
2280 // rscratch2 = 3
2281 sve_cntp(rscratch2, H, ptrue, ptmp);
2282 // Pack the active elements (now SHORT-sized) to the right,
2283 // and fill the remaining elements with zero.
2284 // dst = 0000 0044 0022 0011
2285 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2286 // Narrow the result back to type BYTE.
2287 // dst = 00 00 00 00 00 44 22 11
2288 sve_uzp1(dst, B, dst, vtmp4);
2289
2290 // Repeat to the highest half.
2291 // ptmp = 0001 0000 0000 0001
2292 sve_punpkhi(ptmp, mask);
2293 // vtmp2 = 0088 0077 0066 0055
2294 sve_uunpkhi(vtmp2, H, src);
2295 // vtmp1 = 0000 0000 0088 0055
2296 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2297
2298 sve_dup(vtmp4, B, 0);
2299 // vtmp1 = 00 00 00 00 00 00 88 55
2300 sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2301
2302 // Compressed low: dst = 00 00 00 00 00 44 22 11
2303 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
2304 // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2305 // TRUE_CNT is the number of active elements in the compressed low part.
2306 neg(rscratch2, rscratch2);
2307 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2308 sve_index(vtmp2, B, rscratch2, 1);
2309 // vtmp1 = 00 00 00 88 55 00 00 00
2310 sve_tbl(vtmp1, B, vtmp1, vtmp2);
2311 // Combine the compressed high part (after shifting) with the compressed low part.
2312 // dst = 00 00 00 88 55 44 22 11
2313 sve_orr(dst, dst, vtmp1);
2314 }
2315
2316 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2317 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2318 SIMD_Arrangement size = isQ ? T16B : T8B;
2319 if (bt == T_BYTE) {
2320 rbit(dst, size, src);
2321 } else {
2322 neon_reverse_bytes(dst, src, bt, isQ);
2323 rbit(dst, size, dst);
2324 }
2325 }
2326
2327 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2328 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2329 SIMD_Arrangement size = isQ ? T16B : T8B;
2330 switch (bt) {
2331 case T_BYTE:
2332 if (dst != src) {
2333 orr(dst, size, src, src);
2334 }
2335 break;
2336 case T_SHORT:
2337 rev16(dst, size, src);
2338 break;
2339 case T_INT:
2340 rev32(dst, size, src);
2341 break;
2342 case T_LONG:
2343 rev64(dst, size, src);
2344 break;
2345 default:
2346 assert(false, "unsupported");
2347 ShouldNotReachHere();
2348 }
2349 }
2350
2351 // VectorRearrange implementation for short/int/float/long/double types with NEON
2352 // instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
2353 // But since it supports byte tables only, we need to look up 2/4 bytes as a group.
2354 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2355 // and use bsl to implement the operation.
2356 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2357 FloatRegister shuffle, FloatRegister tmp,
2358 BasicType bt, bool isQ) {
2359 assert_different_registers(dst, src, shuffle, tmp);
2360 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2361 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2362
2363 // Here is an example that rearranges a NEON vector with 4 ints:
2364 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2365 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2366 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2367 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2368 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2369 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2370 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2371 // 4. Use Vm as index register, and use V1 as table register.
2372 // Then get V2 as the result by tbl NEON instructions.
2373 switch (bt) {
2374 case T_SHORT:
2375 mov(tmp, size1, 0x02);
2376 mulv(dst, size2, shuffle, tmp);
2377 mov(tmp, size2, 0x0100);
2378 addv(dst, size1, dst, tmp);
2379 tbl(dst, size1, src, 1, dst);
2380 break;
2381 case T_INT:
2382 case T_FLOAT:
2383 mov(tmp, size1, 0x04);
2384 mulv(dst, size2, shuffle, tmp);
2385 mov(tmp, size2, 0x03020100);
2386 addv(dst, size1, dst, tmp);
2387 tbl(dst, size1, src, 1, dst);
2388 break;
2389 case T_LONG:
2390 case T_DOUBLE:
2391 // Load the iota indices for Long type. The indices are ordered by
2392 // type B/S/I/L/F/D, and the offset between two types is 16; Hence
2393 // the offset for L is 48.
2394 lea(rscratch1,
2395 ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2396 ldrq(tmp, rscratch1);
2397 // Check whether the input "shuffle" is the same with iota indices.
2398 // Return "src" if true, otherwise swap the two elements of "src".
2399 cm(EQ, dst, size2, shuffle, tmp);
2400 ext(tmp, size1, src, src, 8);
2401 bsl(dst, size1, src, tmp);
2402 break;
2403 default:
2404 assert(false, "unsupported element type");
2405 ShouldNotReachHere();
2406 }
2407 }
2408
2409 // Extract a scalar element from an sve vector at position 'idx'.
2410 // The input elements in src are expected to be of integral type.
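// Lanes within the lowest 128 bits can be read directly with umov/smov; higher lanes are
// first shifted down to lane 0 with "sve_ext" and then read.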
2411 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2412 int idx, FloatRegister vtmp) {
2413 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2414 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2415 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2416 if (bt == T_INT || bt == T_LONG) {
2417 umov(dst, src, size, idx);
2418 } else {
2419 smov(dst, src, size, idx);
2420 }
2421 } else {
2422 sve_orr(vtmp, src, src);
2423 sve_ext(vtmp, vtmp, idx << size);
2424 if (bt == T_INT || bt == T_LONG) {
2425 umov(dst, vtmp, size, 0);
2426 } else {
2427 smov(dst, vtmp, size, 0);
2428 }
2429 }
2430 }
2431
2432 // java.lang.Math::round intrinsics
2433
2434 // Clobbers: rscratch1, rflags
2435 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2436 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2437 assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2438 switch (T) {
2439 case T2S:
2440 case T4S:
2441 fmovs(tmp1, T, 0.5f);
2442 mov(rscratch1, jint_cast(0x1.0p23f));
2443 break;
2444 case T2D:
2445 fmovd(tmp1, T, 0.5);
2446 mov(rscratch1, julong_cast(0x1.0p52));
2447 break;
2448 default:
2449 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2450 }
2451 fadd(tmp1, T, tmp1, src);
2452 fcvtms(tmp1, T, tmp1);
2453 // tmp1 = floor(src + 0.5, ties to even)
2454
2455 fcvtas(dst, T, src);
2456 // dst = round(src), ties to away
2457
2458 fneg(tmp3, T, src);
2459 dup(tmp2, T, rscratch1);
2460 cm(HS, tmp3, T, tmp3, tmp2);
2461 // tmp3 is now a set of flags
2462
2463 bif(dst, T16B, tmp1, tmp3);
2464 // result in dst
2465 }
2466
2467 // Clobbers: rscratch1, rflags
2468 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2469 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2470 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2471 assert_different_registers(tmp1, tmp2, src, dst);
2472
2473 switch (T) {
2474 case S:
2475 mov(rscratch1, jint_cast(0x1.0p23f));
2476 break;
2477 case D:
2478 mov(rscratch1, julong_cast(0x1.0p52));
2479 break;
2480 default:
2481 assert(T == S || T == D, "invalid register variant");
2482 }
2483
2484 sve_frinta(dst, T, ptrue, src);
2485 // dst = round(src), ties to away
2486
2487 Label none;
2488
2489 sve_fneg(tmp1, T, ptrue, src);
2490 sve_dup(tmp2, T, rscratch1);
2491 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2492 br(EQ, none);
2493 {
2494 sve_cpy(tmp1, T, pgtmp, 0.5);
2495 sve_fadd(tmp1, T, pgtmp, src);
2496 sve_frintm(dst, T, pgtmp, tmp1);
2497 // dst = floor(src + 0.5, ties to even)
2498 }
2499 bind(none);
2500
2501 sve_fcvtzs(dst, T, ptrue, dst, T);
2502 // result in dst
2503 }
2504
2505 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2506 FloatRegister one, SIMD_Arrangement T) {
2507 assert_different_registers(dst, src, zero, one);
2508 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2509
2510 facgt(dst, T, src, zero);
2511 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2512 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2513 }
2514
2515 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2516 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2517 assert_different_registers(dst, src, zero, one, vtmp);
2518 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2519
2520 sve_orr(vtmp, src, src);
2521 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2522 switch (T) {
2523 case S:
2524 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2525 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2526 // on the sign of the float value
2527 break;
2528 case D:
2529 sve_and(vtmp, T, min_jlong);
2530 sve_orr(vtmp, T, jlong_cast(1.0));
2531 break;
2532 default:
2533 assert(false, "unsupported");
2534 ShouldNotReachHere();
2535 }
2536 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2537 // Result in dst
2538 }
2539
2540 bool C2_MacroAssembler::in_scratch_emit_size() {
2541 if (ciEnv::current()->task() != nullptr) {
2542 PhaseOutput* phase_output = Compile::current()->output();
2543 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2544 return true;
2545 }
2546 }
2547 return MacroAssembler::in_scratch_emit_size();
2548 }
2549
2550 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2551 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2552 }
2553
2554 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2555 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2556 if (t == TypeInt::INT) {
2557 return;
2558 }
2559 BLOCK_COMMENT("verify_int_in_range {");
2560 Label L_success, L_failure;
2561
2562 jint lo = t->_lo;
2563 jint hi = t->_hi;
2564
2565 if (lo != min_jint && hi != max_jint) {
2566 subsw(rtmp, rval, lo);
2567 br(Assembler::LT, L_failure);
2568 subsw(rtmp, rval, hi);
2569 br(Assembler::LE, L_success);
2570 } else if (lo != min_jint) {
2571 subsw(rtmp, rval, lo);
2572 br(Assembler::GE, L_success);
2573 } else if (hi != max_jint) {
2574 subsw(rtmp, rval, hi);
2575 br(Assembler::LE, L_success);
2576 } else {
2577 ShouldNotReachHere();
2578 }
2579
2580 bind(L_failure);
2581 movw(c_rarg0, idx);
2582 mov(c_rarg1, rval);
2583 movw(c_rarg2, lo);
2584 movw(c_rarg3, hi);
2585 reconstruct_frame_pointer(rtmp);
2586 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2587 hlt(0);
2588
2589 bind(L_success);
2590 BLOCK_COMMENT("} verify_int_in_range");
2591 }
2592
2593 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2594 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2595 }
2596
2597 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2598 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2599 if (t == TypeLong::LONG) {
2600 return;
2601 }
2602 BLOCK_COMMENT("verify_long_in_range {");
2603 Label L_success, L_failure;
2604
2605 jlong lo = t->_lo;
2606 jlong hi = t->_hi;
2607
2608 if (lo != min_jlong && hi != max_jlong) {
2609 subs(rtmp, rval, lo);
2610 br(Assembler::LT, L_failure);
2611 subs(rtmp, rval, hi);
2612 br(Assembler::LE, L_success);
2613 } else if (lo != min_jlong) {
2614 subs(rtmp, rval, lo);
2615 br(Assembler::GE, L_success);
2616 } else if (hi != max_jlong) {
2617 subs(rtmp, rval, hi);
2618 br(Assembler::LE, L_success);
2619 } else {
2620 ShouldNotReachHere();
2621 }
2622
2623 bind(L_failure);
2624 movw(c_rarg0, idx);
2625 mov(c_rarg1, rval);
2626 mov(c_rarg2, lo);
2627 mov(c_rarg3, hi);
2628 reconstruct_frame_pointer(rtmp);
2629 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2630 hlt(0);
2631
2632 bind(L_success);
2633 BLOCK_COMMENT("} verify_long_in_range");
2634 }
2635
2636 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2637 const int framesize = Compile::current()->output()->frame_size_in_bytes();
2638 if (PreserveFramePointer) {
2639 // frame pointer is valid
2640 #ifdef ASSERT
2641 // Verify frame pointer value in rfp.
2642 add(rtmp, sp, framesize - 2 * wordSize);
2643 Label L_success;
2644 cmp(rfp, rtmp);
2645 br(Assembler::EQ, L_success);
2646 stop("frame pointer mismatch");
2647 bind(L_success);
2648 #endif // ASSERT
2649 } else {
2650 add(rfp, sp, framesize - 2 * wordSize);
2651 }
2652 }
2653
2654 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2655 // using Neon instructions and places them in the destination vector elements corresponding to the
2656 // index vector elements. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
2657 // where NUM_ELEM is the number of BasicType elements per vector.
2658 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2659 // Otherwise, selects src2[idx - NUM_ELEM]
2660 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2661 FloatRegister src2, FloatRegister index,
2662 FloatRegister tmp, unsigned vector_length_in_bytes) {
2663 assert_different_registers(dst, src1, src2, tmp);
2664 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2665
2666 if (vector_length_in_bytes == 16) {
2667 assert(UseSVE <= 1, "sve must be <= 1");
2668 assert(src1->successor() == src2, "Source registers must be ordered");
2669 // If the vector length is 16B, then use the Neon "tbl" instruction with a two-vector table
2670 tbl(dst, size, src1, 2, index);
2671 } else { // vector length == 8
2672 assert(UseSVE == 0, "must be Neon only");
2673 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2674 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2675 // instruction with one vector lookup
2676 ins(tmp, D, src1, 0, 0);
2677 ins(tmp, D, src2, 1, 0);
2678 tbl(dst, size, tmp, 1, index);
2679 }
2680 }
2681
2682 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2683 // using SVE/SVE2 instructions and places them in the destination vector elements corresponding to the
2684 // index vector elements. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
2685 // where NUM_ELEM is the number of BasicType elements per vector.
2686 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2687 // Otherwise, selects src2[idx - NUM_ELEM]
2688 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2689 FloatRegister src2, FloatRegister index,
2690 FloatRegister tmp, SIMD_RegVariant T,
2691 unsigned vector_length_in_bytes) {
2692 assert_different_registers(dst, src1, src2, index, tmp);
2693
2694 if (vector_length_in_bytes == 8) {
2695 // We need to fit both the source vectors (src1, src2) in a single vector register because the
2696 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2697 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2698 // instruction with one vector lookup
2699 assert(UseSVE >= 1, "sve must be >= 1");
2700 ins(tmp, D, src1, 0, 0);
2701 ins(tmp, D, src2, 1, 0);
2702 sve_tbl(dst, T, tmp, index);
2703 } else { // UseSVE == 2 and vector_length_in_bytes > 8
2704 // If the vector length is > 8, then use the SVE2 "tbl" instruction with a two-vector table.
2705 // The assertion vector_length_in_bytes == MaxVectorSize ensures that this operation
2706 // is not executed on machines where vector_length_in_bytes < MaxVectorSize,
2707 // with the only exception of the 8B vector length.
2708 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2709 assert(src1->successor() == src2, "Source registers must be ordered");
2710 sve_tbl(dst, T, src1, src2, index);
2711 }
2712 }
2713
2714 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2715 FloatRegister src2, FloatRegister index,
2716 FloatRegister tmp, BasicType bt,
2717 unsigned vector_length_in_bytes) {
2718
2719 assert_different_registers(dst, src1, src2, index, tmp);
2720
2721 // The cases that can reach this method are -
2722 // - UseSVE = 0, vector_length_in_bytes = 8 or 16
2723 // - UseSVE = 1, vector_length_in_bytes = 8 or 16
2724 // - UseSVE = 2, vector_length_in_bytes >= 8
2725 //
2726 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2727 // and UseSVE = 2 with vector_length_in_bytes >= 8
2728 //
2729 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2730 // UseSVE = 1 with vector_length_in_bytes = 16
2731
2732 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2733 SIMD_RegVariant T = elemType_to_regVariant(bt);
2734 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2735 return;
2736 }
2737
2738 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2739 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2740 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2741
2742 bool isQ = vector_length_in_bytes == 16;
2743
2744 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2745 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2746
2747 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2748 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2749 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2750 // is the number of elements that can fit in a vector. For example, for T_SHORT with a 64-bit vector length,
2751 // the indices range over [0, 8).
2752 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2753 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2754 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2755 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2756 // Add the multiplied result to the vector in tmp to obtain the byte level
2757 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2758 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2759
2760 if (bt == T_BYTE) {
2761 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2762 } else {
2763 int elem_size = (bt == T_SHORT) ? 2 : 4;
2764 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2765
2766 mov(tmp, size1, elem_size);
2767 mulv(dst, size2, index, tmp);
2768 mov(tmp, size2, tbl_offset);
2769 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2770 // to select a set of 2B/4B
2771 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2772 }
2773 }
2774
2775 // Vector expand implementation. Elements from the src vector are expanded into
2776 // the dst vector under the control of the vector mask.
2777 // Since there are no native instructions directly corresponding to expand before
2778 // SVE2p2, the following implementations mainly leverage the TBL instruction to
2779 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2780 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2781 // for NEON and SVE, but with different instructions where appropriate.
2782
2783 // Vector expand implementation for NEON.
2784 //
2785 // An example of 128-bit Byte vector:
2786 // Data direction: high <== low
2787 // Input:
2788 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2789 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2790 // Expected result:
2791 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2792 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2793 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2794 int vector_length_in_bytes) {
2795 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2796 assert_different_registers(dst, src, mask, tmp1, tmp2);
2797 // Since the TBL instruction only supports byte tables, we need to
2798 // compute indices in byte type for all types.
2799 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2800 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2801 dup(tmp1, size, zr);
2802 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
2803 negr(dst, size, mask);
2804 // Calculate vector index for TBL with prefix sum algorithm.
2805 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
2806 for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2807 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2808 addv(dst, size, tmp2, dst);
2809 }
2810 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2811 orr(tmp2, size, mask, mask);
2812 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2813 bsl(tmp2, size, dst, tmp1);
2814 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2815 movi(tmp1, size, 1);
2816 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
2817 subv(dst, size, tmp2, tmp1);
2818 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2819 tbl(dst, size, src, 1, dst);
2820 }
2821
2822 // Vector expand implementation for SVE.
2823 //
2824 // An example of 128-bit Short vector:
2825 // Data direction: high <== low
2826 // Input:
2827 // src = gf ed cb a9 87 65 43 21
2828 // pg = 00 01 00 01 00 01 00 01
2829 // Expected result:
2830 // dst = 00 87 00 65 00 43 00 21
2831 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2832 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2833 int vector_length_in_bytes) {
2834 assert(UseSVE > 0, "expand implementation only for SVE");
2835 assert_different_registers(dst, src, tmp1, tmp2);
2836 SIMD_RegVariant size = elemType_to_regVariant(bt);
2837
2838 // tmp1 = 00 00 00 00 00 00 00 00
2839 sve_dup(tmp1, size, 0);
2840 sve_movprfx(tmp2, tmp1);
2841 // tmp2 = 00 01 00 01 00 01 00 01
2842 sve_cpy(tmp2, size, pg, 1, true);
2843 // Calculate vector index for TBL with prefix sum algorithm.
2844 // tmp2 = 04 04 03 03 02 02 01 01
2845 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2846 sve_movprfx(dst, tmp1);
2847 // The EXT instruction operates on the full-width sve register. The correct
2848 // index calculation method is:
2849 // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2850 // MaxVectorSize - i.
2851 sve_ext(dst, tmp2, MaxVectorSize - i);
2852 sve_add(tmp2, size, dst, tmp2);
2853 }
2854 // dst = 00 04 00 03 00 02 00 01
2855 sve_sel(dst, size, pg, tmp2, tmp1);
2856 // dst = -1 03 -1 02 -1 01 -1 00
2857 sve_sub(dst, size, 1);
2858 // dst = 00 87 00 65 00 43 00 21
2859 sve_tbl(dst, size, src, dst);
2860 }