1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "opto/c2_MacroAssembler.hpp"
28 #include "opto/compile.hpp"
29 #include "opto/intrinsicnode.hpp"
30 #include "opto/matcher.hpp"
31 #include "opto/output.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/stubRoutines.hpp"
34 #include "utilities/globalDefinitions.hpp"
35 #include "utilities/powerOfTwo.hpp"
36
37 #ifdef PRODUCT
38 #define BLOCK_COMMENT(str) /* nothing */
39 #define STOP(error) stop(error)
40 #else
41 #define BLOCK_COMMENT(str) block_comment(str)
42 #define STOP(error) block_comment(error); stop(error)
43 #endif
44
45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
46
47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
48
// jdk.internal.util.ArraysSupport.vectorizedHashCode
//
// Emits the intrinsic body computing the Java polynomial hash
// (result = result * 31 + element, over 'cnt' elements of 'eltype' at 'ary').
// Arrays below 'large_threshold' elements are handled by the unrolled scalar
// loop emitted inline here; larger arrays are delegated to the vectorized
// large_arrays_hashcode stub. Returns pc() on success, or nullptr when the
// trampoline call to the stub cannot be emitted (code cache full).
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT                       ? 4
                                                            : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  // 0x1f == 31, the multiplier of the polynomial hash.
  movw(tmp2, 0x1f);
  // Computed jump into the middle of the unrolled loop body below.
  br(tmp1);

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1);
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    // Code cache is full; let the caller bail out of this compilation.
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}
149
// Emits the C2 fast path for monitorenter (lightweight locking).
// obj is the object to lock, box the on-stack BasicLock slot; t1-t3 are
// temporaries. On fall-through the condition flags encode the outcome:
// EQ => lock acquired, NE => the caller must call into the runtime.
// Clobbers t1, t2, t3, rscratch2 and the condition flags.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must not be synchronized on; divert to the runtime.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    // Expected value has the unlocked bit set, new value has it cleared, so
    // the CAS succeeds only if the mark was unlocked and installs the locked
    // pattern atomically.
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      // Look the monitor up in the thread-local ObjectMonitor cache.
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      // Probe the first entries with straight-line code before looping.
      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // Without the monitor table t1_monitor is a tagged mark word, so the
    // monitor_value tag must be subtracted out of the field offsets.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor in the BasicLock for fast_unlock.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
305
306 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
307 Register t2, Register t3) {
308 assert_different_registers(obj, box, t1, t2, t3);
309
310 // Handle inflated monitor.
311 Label inflated, inflated_load_mark;
312 // Finish fast unlock successfully. MUST branch to with flag == EQ
313 Label unlocked;
314 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
315 Label slow_path;
316
317 const Register t1_mark = t1;
318 const Register t2_top = t2;
319 const Register t3_t = t3;
320
321 { // Fast unlock
322
323 Label push_and_slow_path;
324
325 // Check if obj is top of lock-stack.
326 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
327 subw(t2_top, t2_top, oopSize);
328 ldr(t3_t, Address(rthread, t2_top));
329 cmp(obj, t3_t);
330 // Top of lock stack was not obj. Must be monitor.
331 br(Assembler::NE, inflated_load_mark);
332
333 // Pop lock-stack.
334 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
335 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
336
337 // Check if recursive.
338 subw(t3_t, t2_top, oopSize);
339 ldr(t3_t, Address(rthread, t3_t));
340 cmp(obj, t3_t);
341 br(Assembler::EQ, unlocked);
342
343 // Not recursive.
344 // Load Mark.
345 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
346
347 // Check header for monitor (0b10).
348 // Because we got here by popping (meaning we pushed in locked)
349 // there will be no monitor in the box. So we need to push back the obj
350 // so that the runtime can fix any potential anonymous owner.
351 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
352
353 // Try to unlock. Transition lock bits 0b00 => 0b01
354 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
355 orr(t3_t, t1_mark, markWord::unlocked_value);
356 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
357 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
358 br(Assembler::EQ, unlocked);
359
360 bind(push_and_slow_path);
361 // Compare and exchange failed.
362 // Restore lock-stack and handle the unlock in runtime.
363 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
364 addw(t2_top, t2_top, oopSize);
365 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
366 b(slow_path);
367 }
368
369
370 { // Handle inflated monitor.
371 bind(inflated_load_mark);
372 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
373 #ifdef ASSERT
374 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
375 stop("Fast Unlock not monitor");
376 #endif
377
378 bind(inflated);
379
380 #ifdef ASSERT
381 Label check_done;
382 subw(t2_top, t2_top, oopSize);
383 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
384 br(Assembler::LT, check_done);
385 ldr(t3_t, Address(rthread, t2_top));
386 cmp(obj, t3_t);
387 br(Assembler::NE, inflated);
388 stop("Fast Unlock lock on stack");
389 bind(check_done);
390 #endif
391
392 const Register t1_monitor = t1;
393
394 if (!UseObjectMonitorTable) {
395 assert(t1_monitor == t1_mark, "should be the same here");
396
397 // Untag the monitor.
398 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
399 } else {
400 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
401 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
402 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
403 br(Assembler::LO, slow_path);
404 }
405
406 const Register t2_recursions = t2;
407 Label not_recursive;
408
409 // Check if recursive.
410 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
411 cbz(t2_recursions, not_recursive);
412
413 // Recursive unlock.
414 sub(t2_recursions, t2_recursions, 1u);
415 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
416 // Set flag == EQ
417 cmp(t2_recursions, t2_recursions);
418 b(unlocked);
419
420 bind(not_recursive);
421
422 const Register t2_owner_addr = t2;
423
424 // Compute owner address.
425 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
426
427 // Set owner to null.
428 // Release to satisfy the JMM
429 stlr(zr, t2_owner_addr);
430 // We need a full fence after clearing owner to avoid stranding.
431 // StoreLoad achieves this.
432 membar(StoreLoad);
433
434 // Check if the entry_list is empty.
435 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
436 cmp(rscratch1, zr);
437 br(Assembler::EQ, unlocked); // If so we are done.
438
439 // Check if there is a successor.
440 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
441 cmp(rscratch1, zr);
442 br(Assembler::NE, unlocked); // If so we are done.
443
444 // Save the monitor pointer in the current thread, so we can try to
445 // reacquire the lock in SharedRuntime::monitor_exit_helper().
446 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
447
448 cmp(zr, rthread); // Set Flag to NE => slow path
449 b(slow_path);
450 }
451
452 bind(unlocked);
453 cmp(zr, zr); // Set Flags to EQ => fast path
454
455 #ifdef ASSERT
456 // Check that unlocked label is reached with Flags == EQ.
457 Label flag_correct;
458 br(Assembler::EQ, flag_correct);
459 stop("Fast Unlock Flag != EQ");
460 #endif
461
462 bind(slow_path);
463 #ifdef ASSERT
464 // Check that slow_path label is reached with Flags == NE.
465 br(Assembler::NE, flag_correct);
466 stop("Fast Unlock Flag != NE");
467 bind(flag_correct);
468 #endif
469 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
470 }
471
// Search for str1 in str2 and return index or -1
//
// str2/cnt2 is the source string, str1/cnt1 the pattern; 'ae' encodes the
// argument encodings (LL/UU/UL/LU) and 'icnt1' is the pattern length when it
// is a compile-time constant, or -1 when only known at runtime. Runtime-length
// patterns of 8..255 chars with a source at least 4x as long use an inline
// Boyer-Moore-Horspool search; patterns of 16+ chars otherwise go through a
// generated linear stub; everything else uses the inline linear scans below.
// The match index (or -1) is left in 'result'.
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has few java-specific optimizations.
//
// #define ASIZE 256
//
// int bm(unsigned char *x, int m, unsigned char *y, int n) {
//   int i, j;
//   unsigned c;
//   unsigned char bc[ASIZE];
//
//   /* Preprocessing */
//   for (i = 0; i < ASIZE; ++i)
//     bc[i] = m;
//   for (i = 0; i < m - 1; ) {
//     c = x[i];
//     ++i;
//     // c < 256 for Latin1 string, so, no need for branch
//     #ifdef PATTERN_STRING_IS_LATIN1
//     bc[c] = m - i;
//     #else
//     if (c < ASIZE) bc[c] = m - i;
//     #endif
//   }
//
//   /* Searching */
//   j = 0;
//   while (j <= n - m) {
//     c = y[i+j];
//     if (x[m-1] == c)
//       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//     if (i < 0) return j;
//     // c < 256 for Latin1 string, so, no need for branch
//     #ifdef SOURCE_STRING_IS_LATIN1
//     // LL case: (c< 256) always true. Remove branch
//     j += bc[y[j+m-1]];
//     #endif
//     #ifndef PATTERN_STRING_IS_UTF
//     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//     if (c < ASIZE)
//       j += bc[y[j+m-1]];
//     else
//       j += 1
//     #endif
//     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//     if (c < ASIZE)
//       j += bc[y[j+m-1]];
//     else
//       j += m
//     #endif
//   }
// }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    // Allocate the 256-entry bad-character table on the stack and fill it
    // with cnt1 (v0 was pre-filled with cnt1 via dup above), 32 bytes per
    // store-pair.
    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

    // Preprocessing: record the skip distance for each pattern character.
      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        // UTF-16 pattern chars >= 256 have no table entry; skip them.
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

    // Pre-load the tail of the pattern into tmp6 for the inner comparison.
      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    // Main search loop: compare the pattern right-to-left at the current
    // alignment, then advance str2 by the bad-character skip distance.
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE); // deallocate the bad-character table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // tmp5 still holds the original str2 base; the difference is the
      // match offset in bytes.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE); // deallocate the bad-character table
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        // Patterns of 2-3 chars (LL/UU) or a single char are special-cased.
        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        // Scan for the first pattern char, then verify the rest with
        // negative offsets counting up towards zero.
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      // Constant 4-char pattern: compare one whole-pattern load per position.
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      // 2-char pattern: compare one 2-char load per position.
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      // 3-char pattern: match the first 2 chars in one load, then verify
      // the third separately.
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      // Single-char pattern: SWAR scan, 8 bytes of source per iteration.
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        // Replicate the pattern char into every byte/halfword lane of ch1.
        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        // SWAR zero detection: after the eor, matching lanes are zero;
        // (x - 0x01..) & ~(x | 0x7f..) sets the top bit of each zero lane.
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // If the loop overshot by less than a full word, re-scan the last
        // 8 bytes of the source once (offset 0).
        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Index of the first matching lane = leading zeros of the
        // bit-reversed mask, divided by 8 to get a byte offset.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the (negative) byte offset back to a char index.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}
909
910 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
911 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
912
// Emits a search for the 16-bit char in 'ch' within the UTF-16 string
// (str1, cnt1 chars), leaving the index of the first occurrence (or -1)
// in 'result'. Strings of 4+ chars use a SWAR word scan processing 4 chars
// (8 bytes) per iteration; shorter strings use a per-character loop.
// Clobbers rscratch1, rscratch2, cnt1, str1, ch, tmp1-tmp3 and the flags.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string never matches.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate the char into all four 16-bit lanes of ch.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Scan with negative offsets counting up towards zero so the loop test
  // is a simple flag check on the adds below.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // SWAR zero detection: after the eor, matching lanes are zero;
    // (x - 0x0001..) & ~(x | 0x7fff..) sets the top bit of each zero lane.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // If the loop overshot by less than a full word, re-scan the last
    // 8 bytes of the string once (offset 0).
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Index of the first matching lane = leading zeros of the bit-reversed
    // mask, divided by 8 to get a byte offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the (negative) byte offset back to a char index.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
975
976 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
977 Register ch, Register result,
978 FloatRegister ztmp1,
979 FloatRegister ztmp2,
980 PRegister tmp_pg,
981 PRegister tmp_pdn, bool isL)
982 {
983 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
984 assert(tmp_pg->is_governing(),
985 "this register has to be a governing predicate register");
986
987 Label LOOP, MATCH, DONE, NOMATCH;
988 Register vec_len = rscratch1;
989 Register idx = rscratch2;
990
991 SIMD_RegVariant T = (isL == true) ? B : H;
992
993 cbz(cnt1, NOMATCH);
994
995 // Assign the particular char throughout the vector.
996 sve_dup(ztmp2, T, ch);
997 if (isL) {
998 sve_cntb(vec_len);
999 } else {
1000 sve_cnth(vec_len);
1001 }
1002 mov(idx, 0);
1003
1004 // Generate a predicate to control the reading of input string.
1005 sve_whilelt(tmp_pg, T, idx, cnt1);
1006
1007 BIND(LOOP);
1008 // Read a vector of 8- or 16-bit data depending on the string type. Note
1009 // that inactive elements indicated by the predicate register won't cause
1010 // a data read from memory to the destination vector.
1011 if (isL) {
1012 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1013 } else {
1014 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1015 }
1016 add(idx, idx, vec_len);
1017
1018 // Perform the comparison. An element of the destination predicate is set
1019 // to active if the particular char is matched.
1020 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1021
1022 // Branch if the particular char is found.
1023 br(NE, MATCH);
1024
1025 sve_whilelt(tmp_pg, T, idx, cnt1);
1026
1027 // Loop back if the particular char not found.
1028 br(MI, LOOP);
1029
1030 BIND(NOMATCH);
1031 mov(result, -1);
1032 b(DONE);
1033
1034 BIND(MATCH);
1035 // Undo the index increment.
1036 sub(idx, idx, vec_len);
1037
1038 // Crop the vector to find its location.
1039 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1040 add(result, idx, -1);
1041 sve_incp(result, T, tmp_pdn);
1042 BIND(DONE);
1043 }
1044
// Latin1 variant of string_indexof_char: find the index of the first
// occurrence of an 8-bit char. On entry: str1 = string address,
// cnt1 = length in bytes, ch = the char to search for. On exit:
// result = index of the first match, or -1 if not found.
// Clobbers: str1, cnt1, ch, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // An empty string never matches.
  cbz(cnt1, NOMATCH);

  // Strings shorter than 8 bytes (one word) take the simple
  // byte-at-a-time loop.
  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate the byte into all eight bytes of 'ch'.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 past the last full-word start and iterate with a negative
  // byte offset, so loop termination falls out of the 'adds' flags below.
  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // SWAR zero-byte test on (word ^ pattern): after the eor, a matching
    // char becomes a zero byte; tmp1 gets the top bit of a byte set iff
    // that byte of ch1 is zero.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // If the length was not a multiple of 8, re-run the loop once at
    // offset 0 to cover the (possibly overlapping) final word.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Locate the first (lowest-addressed) matching byte: byte-reverse then
    // count leading zeros; divide the bit index by 8 for the byte offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    // Compare one byte at a time.
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // For Latin1 the byte offset is the char index.
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1108
// Compare strings.
// Lexicographically compare str1 and str2, writing a negative/zero/positive
// value into result as for String.compareTo: the difference of the first
// differing characters, or the length difference if one string is a prefix
// of the other. The counts arrive in *bytes* (see comment below); 'ae' is
// the StrIntrinsicNode encoding pair (LL/LU/UL/UU). Strings at or above
// stub_threshold characters are handed off to the compare_long_string stubs.
// NOTE(review): vtmp3, pgtmp1 and pgtmp2 are not referenced in this body —
// presumably reserved by the matching rule in the ad file; confirm there.
// Clobbers: str1, str2, cnt1, cnt2, tmp1, tmp2, vtmp1, vtmp2,
// rscratch1, rscratch2, rflags.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  // vtmpZ stays all-zero; zip1 interleaves Latin1 bytes with it to widen
  // them to UTF-16 halfwords in the mixed-encoding cases.
  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      // Same address: the common prefix is trivially equal, so result is
      // just the length difference already computed above.
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their ends and walk forward with a negative
      // offset in cnt2 (byte-scaled per string encoding).
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // str1 is Latin1: load 4 bytes and widen to 4 UTF-16 chars via zip1
      // with zero, so both sides compare as 4 chars per longword.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror of the LU case: str2 is Latin1 and is widened instead.
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    // rev + clz locate the lowest-addressed differing byte of tmp1 ^ tmp2.
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    // Round the bit index down to a character boundary (8 or 16 bits).
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  // Note: from here on, cnt1 is repurposed to hold the current char of str2.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1344
1345 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1346 FloatRegister src2, Condition cond, bool isQ) {
1347 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1348 FloatRegister zn = src1, zm = src2;
1349 bool needs_negation = false;
1350 switch (cond) {
1351 case LT: cond = GT; zn = src2; zm = src1; break;
1352 case LE: cond = GE; zn = src2; zm = src1; break;
1353 case LO: cond = HI; zn = src2; zm = src1; break;
1354 case LS: cond = HS; zn = src2; zm = src1; break;
1355 case NE: cond = EQ; needs_negation = true; break;
1356 default:
1357 break;
1358 }
1359
1360 if (is_floating_point_type(bt)) {
1361 fcm(cond, dst, size, zn, zm);
1362 } else {
1363 cm(cond, dst, size, zn, zm);
1364 }
1365
1366 if (needs_negation) {
1367 notr(dst, isQ ? T16B : T8B, dst);
1368 }
1369 }
1370
1371 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1372 Condition cond, bool isQ) {
1373 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1374 if (bt == T_FLOAT || bt == T_DOUBLE) {
1375 if (cond == Assembler::NE) {
1376 fcm(Assembler::EQ, dst, size, src);
1377 notr(dst, isQ ? T16B : T8B, dst);
1378 } else {
1379 fcm(cond, dst, size, src);
1380 }
1381 } else {
1382 if (cond == Assembler::NE) {
1383 cm(Assembler::EQ, dst, size, src);
1384 notr(dst, isQ ? T16B : T8B, dst);
1385 } else {
1386 cm(cond, dst, size, src);
1387 }
1388 }
1389 }
1390
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits. The input is expected to carry one boolean per
// byte in bit 0 (as produced by the mask-to-long helpers below); the result
// is an 8-bit value in the low byte of dst.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  // Each orr folds bits together at half the previous distance (7, 14, 28),
  // gathering all eight per-byte bits into the lowest byte.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1401
// Pack the value of each mask element in "src" into a long value in "dst", at most
// the first 64 lane elements. The input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
// one bit in "dst".
//
// Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
// Expected: dst = 0x658D
//
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
                                         FloatRegister vtmp, int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(src, vtmp);
  assert(UseSVE > 0, "must be");

  // Compress the lowest 8 bytes.
  // fmovd moves the low 64 bits (8 boolean bytes) into dst.
  fmovd(dst, src);
  bytemask_compress(dst);
  if (lane_cnt <= 8) return;

  // Repeat on higher bytes and join the results.
  // Compress 8 bytes in each iteration.
  for (int idx = 1; idx < (lane_cnt / 8); idx++) {
    // Extract the idx-th 64-bit chunk of the SVE vector into rscratch1.
    sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
    bytemask_compress(rscratch1);
    // Merge its 8 compressed bits at bit position idx * 8.
    orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
  }
}
1431
// The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
// instruction which requires the FEAT_BITPERM feature.
// Clobbers: vtmp1, vtmp2.
void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(src, vtmp1, vtmp2);
  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");

  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress each significant bit of the byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the biggest lane size (T = D) then
  // concatenate the results.

  // The second source input of BEXT, initialized with 0x01 in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BEXT vtmp1.D, src.D, vtmp2.D
  // src   = 0x0001010000010001 | 0x0100000001010001
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  // ---------------------------------------
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  sve_bext(vtmp1, D, src, vtmp2);

  // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
  // result to dst.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  // dst   = 0x658D
  if (lane_cnt <= 8) {
    // No need to concatenate.
    umov(dst, vtmp1, B, 0);
  } else if (lane_cnt <= 16) {
    // Move byte 8 (the low byte of the second 64-bit chunk) next to byte 0.
    ins(vtmp1, B, vtmp1, 1, 8);
    umov(dst, vtmp1, H, 0);
  } else {
    // As the lane count is 64 at most, the final expected value must be in
    // the lowest 64 bits after narrowing vtmp1 from D to B.
    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
    umov(dst, vtmp1, D, 0);
  }
}
1475
// Unpack the mask, a long value in "src", into a vector register of boolean
// represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
// most 64 lanes.
//
// Below example gives the expected dst vector register, with a valid src(0x658D)
// on a 128-bit vector size machine.
// dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// Clobbers: vtmp.
void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
                                           FloatRegister vtmp, int lane_cnt) {
  assert_different_registers(dst, vtmp);
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");

  // Example: src = 0x658D, lane_cnt = 16
  // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01

  // Put long value from general purpose register into the first lane of vector.
  // vtmp = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp, B, 0);
  mov(vtmp, D, 0, src);

  // Transform the value in the first lane which is mask in bit now to the mask in
  // byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    // Move byte 1 up into the second 64-bit chunk.
    ins(vtmp, B, vtmp, 8, 1);
  } else {
    sve_vector_extend(vtmp, D, vtmp, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(dst, B, 1);

  // BDEP dst.D, vtmp.D, dst.D
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  // dst  = 0x0101010101010101 | 0x0101010101010101
  // ---------------------------------------
  // dst  = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(dst, D, vtmp, dst);
}
1522
1523 // Clobbers: rflags
1524 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1525 FloatRegister zn, FloatRegister zm, Condition cond) {
1526 assert(pg->is_governing(), "This register has to be a governing predicate register");
1527 FloatRegister z1 = zn, z2 = zm;
1528 switch (cond) {
1529 case LE: z1 = zm; z2 = zn; cond = GE; break;
1530 case LT: z1 = zm; z2 = zn; cond = GT; break;
1531 case LO: z1 = zm; z2 = zn; cond = HI; break;
1532 case LS: z1 = zm; z2 = zn; cond = HS; break;
1533 default:
1534 break;
1535 }
1536
1537 SIMD_RegVariant size = elemType_to_regVariant(bt);
1538 if (is_floating_point_type(bt)) {
1539 sve_fcm(cond, pd, size, pg, z1, z2);
1540 } else {
1541 assert(is_integral_type(bt), "unsupported element type");
1542 sve_cmp(cond, pd, size, pg, z1, z2);
1543 }
1544 }
1545
// Get index of the last mask lane that is set
// Clobbers: rscratch1, ptmp.
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the lanes so the last set lane becomes the first.
  sve_rev(ptmp, size, src);
  // BRKB activates all lanes strictly before the first set lane ...
  sve_brkb(ptmp, ptrue, ptmp, false);
  // ... so CNTP yields the distance of the last set lane from the top.
  sve_cntp(dst, size, ptrue, ptmp);
  // index = (lane_count - 1) - distance_from_top.
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1555
// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
// is_unsigned selects zero versus sign extension (_xshll presumably
// dispatches to ushll/sshll — confirm in the header). A 4x widening is
// done in two shift-left-long steps.
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
  if (src_bt == T_BYTE) {
    // 4B to 4S/4I, 8B to 8S
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    if (dst_bt == T_INT) {
      // Second step: widen the halfwords produced above to words.
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 2S to 2I/2L, 4S to 4I
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
    if (dst_bt == T_LONG) {
      // Second step: widen the words produced above to doublewords.
      _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
    }
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}
1584
// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
// Uses XTN (extract narrow), which keeps the low half of each element;
// a 4x narrowing is done in two steps.
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 2I to 2S, 4I to 4B/4S
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      // Second step: narrow the halfwords produced above to bytes.
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2S/2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T2S, src, T2D);
    if (dst_bt == T_SHORT) {
      // Second step: narrow the words produced above to halfwords.
      xtn(dst, T4H, dst, T4S);
    }
  } else {
    ShouldNotReachHere();
  }
}
1614
1615 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1616 FloatRegister src, SIMD_RegVariant src_size,
1617 bool is_unsigned) {
1618 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1619
1620 if (src_size == B) {
1621 switch (dst_size) {
1622 case H:
1623 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1624 break;
1625 case S:
1626 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1627 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1628 break;
1629 case D:
1630 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1631 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1632 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1633 break;
1634 default:
1635 ShouldNotReachHere();
1636 }
1637 } else if (src_size == H) {
1638 if (dst_size == S) {
1639 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1640 } else { // D
1641 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1642 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1643 }
1644 } else if (src_size == S) {
1645 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1646 }
1647 }
1648
// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
// tmp is zeroed first; each sve_uzp1 interleaves elements of src (or dst)
// with tmp's zero elements, so the upper half of dst ends up zero-filled.
// Clobbers: tmp.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  // All-zero vector used as the second uzp1 source.
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      // Multi-step narrowing overwrites dst, so it must not alias tmp.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}
1688
1689 // Extend src predicate to dst predicate with the same lane count but larger
1690 // element size, e.g. 64Byte -> 512Long
1691 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1692 uint dst_element_length_in_bytes,
1693 uint src_element_length_in_bytes) {
1694 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1695 sve_punpklo(dst, src);
1696 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1697 sve_punpklo(dst, src);
1698 sve_punpklo(dst, dst);
1699 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1700 sve_punpklo(dst, src);
1701 sve_punpklo(dst, dst);
1702 sve_punpklo(dst, dst);
1703 } else {
1704 assert(false, "unsupported");
1705 ShouldNotReachHere();
1706 }
1707 }
1708
// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
// Clobbers: ptmp (set to all-false).
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  sve_pfalse(ptmp);
  // One uzp1 per halving step: 2x, 4x and 8x narrowing take 1, 2 and 3
  // steps respectively.
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}
1738
// Vector reduction add for integral type with ASIMD instructions.
// dst = isrc + sum(lanes of vsrc), with the scalar add sign-extending
// isrc to the element width for sub-int types.
// Clobbers: vtmp.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      // Horizontal add of all byte lanes, then fold in the scalar input.
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      // ADDV has no 2S form, so the 64-bit case uses a pairwise add.
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}
1777
// Vector reduction multiply for integral type with ASIMD instructions.
// dst = isrc * product(lanes of vsrc); for sub-int types the running
// product is sign-extended back to the element width after each scalar
// multiply. The vector product is reduced by repeatedly multiplying the
// upper half of the remaining lanes into the lower half.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1], re-sign-extended after each step
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      // Same halving strategy as T_BYTE, on halfword lanes.
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        // Only two lanes: multiply them directly from vsrc below.
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      // Two lanes: extract and multiply with scalar instructions.
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1858
// Vector reduction multiply for floating-point type with ASIMD instructions.
// Multiplies the scalar input "fsrc" with every lane of the vector "vsrc" in
// strict lane order (lane 0, then 1, 2, 3), leaving the scalar product in "dst".
// Clobbers: vtmp (used to shuffle each subsequent lane into position 0)
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
  switch(bt) {
    case T_FLOAT:
      // dst = fsrc * vsrc[0]
      fmuls(dst, fsrc, vsrc);
      // vtmp[0] = vsrc[1]; dst *= vsrc[1]
      ins(vtmp, S, vsrc, 0, 1);
      fmuls(dst, dst, vtmp);
      if (isQ) {
        // 128-bit vector: also fold in lanes 2 and 3.
        ins(vtmp, S, vsrc, 0, 2);
        fmuls(dst, dst, vtmp);
        ins(vtmp, S, vsrc, 0, 3);
        fmuls(dst, dst, vtmp);
      }
      break;
    case T_DOUBLE:
      // Only a 128-bit vector can hold two doubles.
      assert(isQ, "unsupported");
      fmuld(dst, fsrc, vsrc);
      ins(vtmp, D, vsrc, 0, 1);
      fmuld(dst, dst, vtmp);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}
1892
1893 // Helper to select logical instruction
1894 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1895 Register Rn, Register Rm,
1896 enum shift_kind kind, unsigned shift) {
1897 switch(opc) {
1898 case Op_AndReductionV:
1899 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1900 break;
1901 case Op_OrReductionV:
1902 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1903 break;
1904 case Op_XorReductionV:
1905 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1906 break;
1907 default:
1908 assert(false, "unsupported");
1909 ShouldNotReachHere();
1910 }
1911 }
1912
// Vector reduction logical operations And, Or, Xor
// Folds all lanes of "vsrc" together with the scalar input "isrc" using the
// logical operation selected by "opc"; the result in "dst" is sign-extended
// for sub-int element types.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  // Move both halves of the vector into general registers and combine them
  // once, so all remaining folding happens on scalar registers.
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  // Keep halving the working width using the shifted-register form of the
  // logical op until one element remains, fold in "isrc", then sign-extend
  // sub-int results.
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      // Long lanes only exist in a 128-bit vector; the single fold above
      // already combined both lanes, so just fold in the scalar input.
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}
1962
1963 // Helper function to decode min/max reduction operation properties
1964 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
1965 bool* is_unsigned,
1966 Condition* cond) {
1967 switch(opc) {
1968 case Op_MinReductionV:
1969 *is_min = true; *is_unsigned = false; *cond = LT; break;
1970 case Op_MaxReductionV:
1971 *is_min = false; *is_unsigned = false; *cond = GT; break;
1972 case Op_UMinReductionV:
1973 *is_min = true; *is_unsigned = true; *cond = LO; break;
1974 case Op_UMaxReductionV:
1975 *is_min = false; *is_unsigned = true; *cond = HI; break;
1976 default:
1977 ShouldNotReachHere();
1978 }
1979 }
1980
// Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
// Reduces the vector "vsrc" together with the scalar input "isrc" to a single
// min/max value in "dst", using the flavour (min/max, signed/unsigned)
// encoded in "opc".
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
         opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min;
  bool is_unsigned;
  Condition cond;
  decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    // 64-bit lanes: compare the two vector lanes and the scalar input with
    // scalar cmp/csel instead of a vector reduction instruction.
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, cond);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, cond);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      // For T2S (2x32-bit elements), use pairwise instructions because
      // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
      neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
    } else {
      // For other sizes, use reduction to scalar instructions.
      neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
    }
    // Move the reduced value to "dst": int needs no widening, unsigned
    // sub-int types use the zero-extending move, signed ones sign-extend.
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else if (is_unsigned) {
      umov(dst, vtmp, elemType_to_regVariant(bt), 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    // Fold in the scalar input.
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, cond);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
2030
// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
// Reduces the vector "src2" under predicate "pg" and folds in the scalar
// input "src1", leaving the result in "dst".
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        // Sign-extend the byte sum while adding in the scalar input.
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        // Same for short, with a 16-bit sign-extension.
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      // Sign-extend sub-int reduction results before combining with the
      // 32-bit scalar input; int/long are moved as-is.
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV:
    case Op_MinReductionV:
    case Op_UMaxReductionV:
    case Op_UMinReductionV: {
      bool is_min;
      bool is_unsigned;
      Condition cond;
      decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
      // Reduce the vector first, then combine with the scalar input using
      // cmp/csel with the condition matching the flavour.
      sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
      // Move result from vector to general register
      if (is_unsigned || bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, cond);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, cond);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // For the logical ops, narrow the combined result back into the value
  // range of sub-int element types.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
2141
// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  // VL1..VL8 encode the lane count directly in the pattern field.
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    // whilelt(dst, zr, lane_cnt) sets exactly the first lane_cnt lanes.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}
2208
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // high <-- low
  // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst = 00 00 00 hh ee dd bb aa
  //
  // SVE "compact" only supports S and D lane sizes, so each half of the
  // SHORT vector is widened to INT, compacted, and narrowed back before the
  // two halves are spliced together.

  // Extend lowest half to type INT.
  // dst = 00dd 00cc 00bb 00aa
  sve_uunpklo(dst, S, src);
  // pgtmp = 0001 0000 0001 0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst = 0000 00dd 00bb 00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half; used below to position the
  // compressed high half right after the low half's elements.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 0001 0000 0000 0001
  sve_punpkhi(pgtmp, mask);
  // vtmp = 00hh 00gg 00ff 00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp = 0000 0000 00hh 00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:   dst   = 00 00 00 00 00 dd bb aa
  // Compressed high:  vtmp  = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  // dst = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2267
// Pack active byte elements of src, under the control of mask, into the
// lowest-numbered elements of dst; remaining elements are zero-filled.
// Implemented by widening each half to SHORT and delegating to
// sve_compress_short, then splicing the two compressed halves together.
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 00 01 00 00 00 01 00 01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst = 00 00 00 00 00 0g 0c 0a
  // The widened vector is twice as long, so cap the length passed down at
  // MaxVectorSize.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half; used below to position the
  // compressed high half right after the low half's elements.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp = 00 01 00 00 00 00 00 01
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 00 00 00 00 00 00 0p 0i
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:   dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high:  vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2324
2325 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2326 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2327 SIMD_Arrangement size = isQ ? T16B : T8B;
2328 if (bt == T_BYTE) {
2329 rbit(dst, size, src);
2330 } else {
2331 neon_reverse_bytes(dst, src, bt, isQ);
2332 rbit(dst, size, dst);
2333 }
2334 }
2335
2336 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2337 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2338 SIMD_Arrangement size = isQ ? T16B : T8B;
2339 switch (bt) {
2340 case T_BYTE:
2341 if (dst != src) {
2342 orr(dst, size, src, src);
2343 }
2344 break;
2345 case T_SHORT:
2346 rev16(dst, size, src);
2347 break;
2348 case T_INT:
2349 rev32(dst, size, src);
2350 break;
2351 case T_LONG:
2352 rev64(dst, size, src);
2353 break;
2354 default:
2355 assert(false, "unsupported");
2356 ShouldNotReachHere();
2357 }
2358 }
2359
// VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
// But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
// For VectorRearrange long/double, we compare the shuffle input with iota indices,
// and use bsl to implement the operation.
// Clobbers: rscratch1 (T_LONG/T_DOUBLE only), tmp
void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
                                           FloatRegister shuffle, FloatRegister tmp,
                                           BasicType bt, bool isQ) {
  assert_different_registers(dst, src, shuffle, tmp);
  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Here is an example that rearranges a NEON vector with 4 ints:
  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
  //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
  //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
  //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
  //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
  //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
  //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
  //   4. Use Vm as index register, and use V1 as table register.
  //      Then get V2 as the result by tbl NEON instructions.
  switch (bt) {
    case T_SHORT:
      // Same scheme as the int example above, with 2-byte groups:
      // scale indices by 2 and add the {1,0} byte offsets per group.
      mov(tmp, size1, 0x02);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x0100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_INT:
    case T_FLOAT:
      // 4-byte groups: scale indices by 4, add the {3,2,1,0} byte offsets.
      mov(tmp, size1, 0x04);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x03020100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_LONG:
    case T_DOUBLE:
      // Load the iota indices for Long type. The indices are ordered by
      // type B/S/I/L/F/D, and the offset between two types is 16; Hence
      // the offset for L is 48.
      lea(rscratch1,
          ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
      ldrq(tmp, rscratch1);
      // Check whether the input "shuffle" is the same with iota indices.
      // Return "src" if true, otherwise swap the two elements of "src".
      cm(EQ, dst, size2, shuffle, tmp);
      ext(tmp, size1, src, src, 8);
      bsl(dst, size1, src, tmp);
      break;
    default:
      assert(false, "unsupported element type");
      ShouldNotReachHere();
  }
}
2417
// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
// Sub-int results are sign-extended into "dst"; int/long are moved as-is.
// Clobbers: vtmp (only when the element lies above the low 128 bits)
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    // The element is beyond NEON's reach: copy the vector and shift the
    // wanted element down to lane 0, then move it out from there.
    sve_orr(vtmp, src, src);
    // Byte offset of the element; "idx << size" relies on B/H/S/D encoding
    // log2 of the element size in bytes -- see SIMD_RegVariant.
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}
2440
// java.lang.Math::round intrinsics

// Vectorized Math.round for float/double lanes with NEON instructions.
// Combines two rounding results per lane: round-ties-to-away (fcvtas) and
// floor(src + 0.5) (fadd + fcvtms), selecting between them with a per-lane
// mask built from a magnitude comparison against 2^23 (float) / 2^52 (double).
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      // 2^23: smallest float magnitude at which every value is integral.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      // 2^52: smallest double magnitude at which every value is integral.
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Build the per-lane selection mask by comparing the bit pattern of -src
  // against the 2^23/2^52 constant.
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // Where the mask is clear, insert tmp1 (floor(src + 0.5)); elsewhere keep
  // the fcvtas result already in dst.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2475
// Vectorized Math.round for float/double lanes with SVE instructions.
// Starts from round-ties-to-away (frinta) and, for the lanes selected by a
// magnitude comparison against 2^23 (float) / 2^52 (double), replaces the
// result with floor(src + 0.5); the final value is converted to an integer.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // 2^23: smallest float magnitude at which every value is integral.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      // 2^52: smallest double magnitude at which every value is integral.
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // Compute in pgtmp the predicate of lanes that need the floor(src + 0.5)
  // correction; skip the correction entirely when no lane is selected.
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2513
// Vector signum with NEON: per lane, +/-0.0 and NaN pass through unchanged,
// any other value becomes +/-1.0 with the sign taken from src.
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  // Lane mask: all-ones if |src| > 0.0, zero for +/-0.0 and NaN.
  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  // Bitwise select: where mask bits are set (every bit but the sign bit),
  // take the magnitude bits of "one"; the sign bit comes from "src", giving
  // +/-1.0. Zero-mask lanes keep "src" entirely.
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
2523
// Vector signum with SVE: per lane, +/-0.0 and NaN pass through unchanged,
// any other value becomes +/-1.0 with the sign taken from src.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  // Work on a copy of src so the original is preserved for the final select.
  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      // Same as above, with the 64-bit sign mask and double constant 1.0.
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}
2548
2549 bool C2_MacroAssembler::in_scratch_emit_size() {
2550 if (ciEnv::current()->task() != nullptr) {
2551 PhaseOutput* phase_output = Compile::current()->output();
2552 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2553 return true;
2554 }
2555 }
2556 return MacroAssembler::in_scratch_emit_size();
2557 }
2558
// Runtime target for the range check emitted by verify_int_in_range below:
// reports the failing node index, value and bounds, then aborts the VM.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2562
// Emit a runtime check that the int value in "rval" lies within the range
// [t->_lo, t->_hi] of the C2 type "t"; on violation, call
// abort_verify_int_in_range to abort the VM with a diagnostic.
// Emits nothing when the type is unconstrained (TypeInt::INT).
// Clobbers: rtmp, rflags (and c_rarg0..c_rarg3 on the failure path)
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  // Only check the bounds that actually constrain the value.
  if (lo != min_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jint) {
    subsw(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, val, lo, hi) into the C argument registers and call into
  // the VM to report the failure.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  hlt(0); // Not reached: the callee aborts the VM.

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}
2597
// Runtime target for the range check emitted by verify_long_in_range below:
// reports the failing node index, value and bounds, then aborts the VM.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2601
// Emit a runtime check that the long value in "rval" lies within the range
// [t->_lo, t->_hi] of the C2 type "t"; on violation, call
// abort_verify_long_in_range to abort the VM with a diagnostic.
// Emits nothing when the type is unconstrained (TypeLong::LONG).
// Clobbers: rtmp, rflags (and c_rarg0..c_rarg3 on the failure path)
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  // Only check the bounds that actually constrain the value.
  if (lo != min_jlong) {
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jlong) {
    subs(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, val, lo, hi) into the C argument registers and call into
  // the VM to report the failure.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  mov(c_rarg2, lo);
  mov(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
  hlt(0); // Not reached: the callee aborts the VM.

  bind(L_success);
  BLOCK_COMMENT("} verify_long_in_range");
}
2636
// Ensure rfp holds the frame pointer of the current compiled frame.
// With PreserveFramePointer rfp is already maintained by compiled code (and
// is verified here in debug builds); otherwise it is recomputed from sp and
// the compiled frame size.
// Clobbers: rtmp (ASSERT builds with PreserveFramePointer only)
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rfp.
    add(rtmp, sp, framesize - 2 * wordSize);
    Label L_success;
    cmp(rfp, rtmp);
    br(Assembler::EQ, L_success);
    stop("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    // Rebuild rfp as sp + framesize - 2 words (presumably skipping the
    // saved fp/lr pair at the top of the frame -- see frame layout).
    add(rfp, sp, framesize - 2 * wordSize);
  }
}
2654
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using Neon instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx – NUM_ELEM]
// Clobbers: tmp (8-byte vector case only)
void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
                                                     FloatRegister src2, FloatRegister index,
                                                     FloatRegister tmp, unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, tmp);
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;

  if (vector_length_in_bytes == 16) {
    assert(UseSVE <= 1, "sve must be <= 1");
    // The two-register form of "tbl" requires consecutive table registers.
    assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
    tbl(dst, size, src1, 2, index);
  } else { // vector length == 8
    assert(UseSVE == 0, "must be Neon only");
    // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
    // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
    // instruction with one vector lookup
    ins(tmp, D, src1, 0, 0); // tmp[63:0]   = src1[63:0]
    ins(tmp, D, src2, 1, 0); // tmp[127:64] = src2[63:0]
    tbl(dst, size, tmp, 1, index);
  }
}
2682
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx – NUM_ELEM]
// Clobbers: tmp (8-byte vector case only)
void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
                                                    FloatRegister src2, FloatRegister index,
                                                    FloatRegister tmp, SIMD_RegVariant T,
                                                    unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, index, tmp);

  if (vector_length_in_bytes == 8) {
    // We need to fit both the source vectors (src1, src2) in a single vector register because the
    // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
    // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
    // instruction with one vector lookup
    assert(UseSVE >= 1, "sve must be >= 1");
    ins(tmp, D, src1, 0, 0); // tmp[63:0]   = src1[63:0]
    ins(tmp, D, src2, 1, 0); // tmp[127:64] = src2[63:0]
    sve_tbl(dst, T, tmp, index);
  } else { // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
    // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
    // is not executed on machines where vector_length_in_bytes < MaxVectorSize
    // with the only exception of 8B vector length.
    assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
    assert(src1->successor() == src2, "Source registers must be ordered");
    sve_tbl(dst, T, src1, src2, index);
  }
}
2714
// Select elements from the two source vectors "src1" and "src2" into "dst",
// as directed by the per-element indices held in the "index" vector. The
// index values are in the range [0, 2 * NUM_ELEM), where NUM_ELEM is the
// number of elements of type "bt" that fit in a vector; indices below
// NUM_ELEM select from src1 and the remaining ones select from src2.
// "tmp" is a scratch register and is clobbered.
void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
                                                FloatRegister src2, FloatRegister index,
                                                FloatRegister tmp, BasicType bt,
                                                unsigned vector_length_in_bytes) {

  assert_different_registers(dst, src1, src2, index, tmp);

  // The cases that can reach this method are -
  // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
  // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
  //
  // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
  // and UseSVE = 2 with vector_length_in_bytes >= 8
  //
  // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
  // UseSVE = 1 with vector_length_in_bytes = 16

  if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
    SIMD_RegVariant T = elemType_to_regVariant(bt);
    select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
    return;
  }

  // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
  assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
  assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");

  bool isQ = vector_length_in_bytes == 16;

  // size1 - byte-wise arrangement (the Neon "tbl" instruction only operates on
  // byte tables); size2 - arrangement matching the element size of "bt".
  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
  // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
  // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
  // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
  // the indices can range from [0, 8).
  // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
  // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
  // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
  // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
  // Add the multiplied result to the vector in tmp to obtain the byte level
  // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
  // Use these offsets in the "tbl" instruction to select chunks of 2B.

  if (bt == T_BYTE) {
    // Byte elements: the indices are already byte-level, use them directly.
    select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
  } else {
    // Scale the element indices to byte offsets (see worked example above).
    int elem_size = (bt == T_SHORT) ? 2 : 4;
    uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;

    mov(tmp, size1, elem_size);
    mulv(dst, size2, index, tmp);
    mov(tmp, size2, tbl_offset);
    addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
                                // to select a set of 2B/4B
    select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
  }
}
2774
// Vector expand implementation. Elements from the src vector are expanded into
// the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
// (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
// for NEON and SVE, but with different instructions where appropriate.
2782
2783 // Vector expand implementation for NEON.
2784 //
2785 // An example of 128-bit Byte vector:
2786 // Data direction: high <== low
2787 // Input:
2788 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2789 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2790 // Expected result:
2791 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte table, we need to
  // compute indices in byte type for all types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  dup(tmp1, size, zr);
  // Negate the mask lanes (-1 active / 0 inactive) to get a per-byte 1/0 count.
  // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each EXT shifts the running sum up by i bytes (zero-filling from tmp1)
  // before accumulating, doubling the summed span every iteration.
  // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
  // ORR of a register with itself is a vector move: tmp2 = mask.
  // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
  orr(tmp2, size, mask, mask);
  // Bitwise select under the mask held in tmp2: active lanes take the prefix
  // sum from dst, inactive lanes take 0 from tmp1.
  // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  movi(tmp1, size, 1);
  // Convert the 1-based sums to 0-based TBL indices. Inactive lanes wrap to
  // -1 (0xff) - an out-of-range index, for which TBL produces zero.
  // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
  subv(dst, size, tmp2, tmp1);
  // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
  tbl(dst, size, src, 1, dst);
}
2821
2822 // Vector expand implementation for SVE.
2823 //
2824 // An example of 128-bit Short vector:
2825 // Data direction: high <== low
2826 // Input:
2827 // src = gf ed cb a9 87 65 43 21
2828 // pg = 00 01 00 01 00 01 00 01
2829 // Expected result:
2830 // dst = 00 87 00 65 00 43 00 21
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
  // Zero-initialize tmp2 (via movprfx from tmp1), then set its active lanes
  // (under pg) to 1.
  sve_movprfx(tmp2, tmp1);
  // tmp2 = 00 01 00 01 00 01 00 01
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width sve register. The correct
    // index calculation method is:
    // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
  // Keep the prefix sums in the active lanes and zeros (from tmp1) elsewhere.
  // dst = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // Convert the 1-based sums to 0-based TBL indices. Inactive lanes wrap to
  // -1 - an out-of-range index, for which TBL produces zero.
  // dst = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
  // dst = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}