1 /*
2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "opto/c2_MacroAssembler.hpp"
28 #include "opto/compile.hpp"
29 #include "opto/intrinsicnode.hpp"
30 #include "opto/matcher.hpp"
31 #include "opto/output.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/stubRoutines.hpp"
34 #include "utilities/globalDefinitions.hpp"
35 #include "utilities/powerOfTwo.hpp"
36
37 #ifdef PRODUCT
38 #define BLOCK_COMMENT(str) /* nothing */
39 #define STOP(error) stop(error)
40 #else
41 #define BLOCK_COMMENT(str) block_comment(str)
42 #define STOP(error) block_comment(error); stop(error)
43 #endif
44
45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
46
47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
48
49 // jdk.internal.util.ArraysSupport.vectorizedHashCode
50 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
51 FloatRegister vdata0, FloatRegister vdata1,
52 FloatRegister vdata2, FloatRegister vdata3,
53 FloatRegister vmul0, FloatRegister vmul1,
54 FloatRegister vmul2, FloatRegister vmul3,
55 FloatRegister vpow, FloatRegister vpowm,
56 BasicType eltype) {
57 ARRAYS_HASHCODE_REGISTERS;
58
59 Register tmp1 = rscratch1, tmp2 = rscratch2;
60
61 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
62
63 // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
64 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
65 // use 4H for chars and shorts instead, but using 8H gives better performance.
66 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
67 : eltype == T_CHAR || eltype == T_SHORT ? 8
68 : eltype == T_INT ? 4
69 : 0;
70 guarantee(vf, "unsupported eltype");
71
72 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
73 const size_t unroll_factor = 4;
74
75 switch (eltype) {
76 case T_BOOLEAN:
77 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
78 break;
79 case T_CHAR:
80 BLOCK_COMMENT("arrays_hashcode(char) {");
81 break;
82 case T_BYTE:
83 BLOCK_COMMENT("arrays_hashcode(byte) {");
84 break;
85 case T_SHORT:
86 BLOCK_COMMENT("arrays_hashcode(short) {");
87 break;
88 case T_INT:
89 BLOCK_COMMENT("arrays_hashcode(int) {");
90 break;
91 default:
92 ShouldNotReachHere();
93 }
94
95 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
96 // implemented by the stub executes just once. Call the stub only if at least two iterations will
97 // be executed.
98 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
99 cmpw(cnt, large_threshold);
100 br(Assembler::HS, LARGE);
101
102 bind(TAIL);
103
104 // The andr computes cnt % uf, where uf = unroll_factor. The subtract, shifted left by 3, moves the
105 // branch target past uf - (cnt % uf) load + madd pairs, so only cnt % uf pairs execute on the first
106 // pass. The loop then consumes the remainder, uf elements at a time.
107 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
108 andr(tmp2, cnt, unroll_factor - 1);
109 adr(tmp1, BR_BASE);
110 // For Cortex-A53 the shift is 4 because 2 nops are generated per iteration (4 instructions, 16 bytes).
111 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
112 movw(tmp2, 0x1f);
113 br(tmp1);
114
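// For reference, the unrolled scalar loop below is roughly equivalent to:
//
//   while (cnt-- > 0) {
//     result = 31 * result + *ary++;   // tmp2 holds the constant 31 (0x1f)
//   }
//
// entered via the computed branch above so that the first pass executes only
// cnt % unroll_factor element updates.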
115 bind(LOOP);
116 for (size_t i = 0; i < unroll_factor; ++i) {
117 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
118 maddw(result, result, tmp2, tmp1);
119 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
120 // Generate 2nd nop to have 4 instructions per iteration.
121 if (VM_Version::supports_a53mac()) {
122 nop();
123 }
124 }
125 bind(BR_BASE);
126 subsw(cnt, cnt, unroll_factor);
127 br(Assembler::HS, LOOP);
128
129 b(DONE);
130
131 bind(LARGE);
132
133 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
134 assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
135 address tpc = trampoline_call(stub);
136 if (tpc == nullptr) {
137 DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
138 postcond(pc() == badAddress);
139 return nullptr;
140 }
141
142 bind(DONE);
143
144 BLOCK_COMMENT("} // arrays_hashcode");
145
146 postcond(pc() != badAddress);
147 return pc();
148 }
149
150 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
151 Register t2, Register t3) {
152 assert_different_registers(obj, box, t1, t2, t3, rscratch2);
153
154 // Handle inflated monitor.
155 Label inflated;
156 // Finish fast lock successfully. MUST branch to this label with flag == EQ
157 Label locked;
158 // Finish fast lock unsuccessfully. MUST branch to this label with flag == NE
159 Label slow_path;
160
161 if (UseObjectMonitorTable) {
162 // Clear cache in case fast locking succeeds or we need to take the slow-path.
163 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
164 }
165
166 if (DiagnoseSyncOnValueBasedClasses != 0) {
167 load_klass(t1, obj);
168 ldrb(t1, Address(t1, Klass::misc_flags_offset()));
169 tst(t1, KlassFlags::_misc_is_value_based_class);
170 br(Assembler::NE, slow_path);
171 }
172
173 const Register t1_mark = t1;
174 const Register t3_t = t3;
175
176 { // Fast locking
177
178 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
179 Label push;
180
181 const Register t2_top = t2;
182
183 // Check if lock-stack is full.
184 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
185 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
186 br(Assembler::GT, slow_path);
187
188 // Check if recursive.
189 subw(t3_t, t2_top, oopSize);
190 ldr(t3_t, Address(rthread, t3_t));
191 cmp(obj, t3_t);
192 br(Assembler::EQ, push);
193
194 // Relaxed normal load to check for monitor. Optimization for monitor case.
195 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
196 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
197
198 // Not inflated
199 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
200
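// For reference, the low two mark-word bits encode the lock state:
// 0b01 = unlocked, 0b00 = fast-locked, 0b10 = inflated (monitor).
// The orr/eor below build the expected (unlocked) and new (fast-locked)
// mark values from whatever mark was loaded above.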
201 // Try to lock. Transition lock-bits 0b01 => 0b00
202 orr(t1_mark, t1_mark, markWord::unlocked_value);
203 eor(t3_t, t1_mark, markWord::unlocked_value);
204 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
205 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
206 br(Assembler::NE, slow_path);
207
208 bind(push);
209 // After successful lock, push object on lock-stack.
210 str(obj, Address(rthread, t2_top));
211 addw(t2_top, t2_top, oopSize);
212 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
213 b(locked);
214 }
215
216 { // Handle inflated monitor.
217 bind(inflated);
218
219 const Register t1_monitor = t1;
220
221 if (!UseObjectMonitorTable) {
222 assert(t1_monitor == t1_mark, "should be the same here");
223 } else {
224 Label monitor_found;
225
226 // Load cache address
227 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
228
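// The thread-local monitor cache is conceptually an array of (oop, monitor)
// pairs terminated by a null sentinel: oop_to_oop_difference() is the stride
// between consecutive oop slots and oop_to_monitor_difference() is the offset
// from an oop slot to its associated ObjectMonitor*. The first two entries
// are checked with unrolled code, the rest in the loop below.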
229 const int num_unrolled = 2;
230 for (int i = 0; i < num_unrolled; i++) {
231 ldr(t1, Address(t3_t));
232 cmp(obj, t1);
233 br(Assembler::EQ, monitor_found);
234 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
235 }
236
237 Label loop;
238
239 // Search for obj in cache.
240 bind(loop);
241
242 // Check for match.
243 ldr(t1, Address(t3_t));
244 cmp(obj, t1);
245 br(Assembler::EQ, monitor_found);
246
247 // Search until null encountered, guaranteed _null_sentinel at end.
248 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
249 cbnz(t1, loop);
250 // Cache Miss, NE set from cmp above, cbnz does not set flags
251 b(slow_path);
252
253 bind(monitor_found);
254 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
255 }
256
257 const Register t2_owner_addr = t2;
258 const Register t3_owner = t3;
259 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
260 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
261 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
262
263 Label monitor_locked;
264
265 // Compute owner address.
266 lea(t2_owner_addr, owner_address);
267
268 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
269 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
270 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
271 /*release*/ false, /*weak*/ false, t3_owner);
272 br(Assembler::EQ, monitor_locked);
273
274 // Check if recursive.
275 cmp(t3_owner, rscratch2);
276 br(Assembler::NE, slow_path);
277
278 // Recursive.
279 increment(recursions_address, 1);
280
281 bind(monitor_locked);
282 if (UseObjectMonitorTable) {
283 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
284 }
285 }
286
287 bind(locked);
288
289 #ifdef ASSERT
290 // Check that locked label is reached with Flags == EQ.
291 Label flag_correct;
292 br(Assembler::EQ, flag_correct);
293 stop("Fast Lock Flag != EQ");
294 #endif
295
296 bind(slow_path);
297 #ifdef ASSERT
298 // Check that slow_path label is reached with Flags == NE.
299 br(Assembler::NE, flag_correct);
300 stop("Fast Lock Flag != NE");
301 bind(flag_correct);
302 #endif
303 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
304 }
305
306 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
307 Register t2, Register t3) {
308 assert_different_registers(obj, box, t1, t2, t3);
309
310 // Handle inflated monitor.
311 Label inflated, inflated_load_mark;
312 // Finish fast unlock successfully. MUST branch to this label with flag == EQ
313 Label unlocked;
314 // Finish fast unlock unsuccessfully. MUST branch to this label with flag == NE
315 Label slow_path;
316
317 const Register t1_mark = t1;
318 const Register t2_top = t2;
319 const Register t3_t = t3;
320
321 { // Fast unlock
322
323 Label push_and_slow_path;
324
325 // Check if obj is top of lock-stack.
326 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
327 subw(t2_top, t2_top, oopSize);
328 ldr(t3_t, Address(rthread, t2_top));
329 cmp(obj, t3_t);
330 // Top of lock stack was not obj. Must be monitor.
331 br(Assembler::NE, inflated_load_mark);
332
333 // Pop lock-stack.
334 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
335 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
336
337 // Check if recursive.
338 subw(t3_t, t2_top, oopSize);
339 ldr(t3_t, Address(rthread, t3_t));
340 cmp(obj, t3_t);
341 br(Assembler::EQ, unlocked);
342
343 // Not recursive.
344 // Load Mark.
345 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
346
347 // Check header for monitor (0b10).
348 // Because we got here by popping (meaning we pushed in locked)
349 // there will be no monitor in the box. So we need to push back the obj
350 // so that the runtime can fix any potential anonymous owner.
351 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
352
353 // Try to unlock. Transition lock bits 0b00 => 0b01
354 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
355 orr(t3_t, t1_mark, markWord::unlocked_value);
356 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
357 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
358 br(Assembler::EQ, unlocked);
359
360 bind(push_and_slow_path);
361 // Compare and exchange failed.
362 // Restore lock-stack and handle the unlock in runtime.
363 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
364 addw(t2_top, t2_top, oopSize);
365 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
366 b(slow_path);
367 }
368
369
370 { // Handle inflated monitor.
371 bind(inflated_load_mark);
372 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
373 #ifdef ASSERT
374 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
375 stop("Fast Unlock not monitor");
376 #endif
377
378 bind(inflated);
379
380 #ifdef ASSERT
381 Label check_done;
382 subw(t2_top, t2_top, oopSize);
383 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
384 br(Assembler::LT, check_done);
385 ldr(t3_t, Address(rthread, t2_top));
386 cmp(obj, t3_t);
387 br(Assembler::NE, inflated);
388 stop("Fast Unlock lock on stack");
389 bind(check_done);
390 #endif
391
392 const Register t1_monitor = t1;
393
394 if (!UseObjectMonitorTable) {
395 assert(t1_monitor == t1_mark, "should be the same here");
396
397 // Untag the monitor.
398 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
399 } else {
400 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
401 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
402 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
403 br(Assembler::LO, slow_path);
404 }
405
406 const Register t2_recursions = t2;
407 Label not_recursive;
408
409 // Check if recursive.
410 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
411 cbz(t2_recursions, not_recursive);
412
413 // Recursive unlock.
414 sub(t2_recursions, t2_recursions, 1u);
415 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
416 // Set flag == EQ
417 cmp(t2_recursions, t2_recursions);
418 b(unlocked);
419
420 bind(not_recursive);
421
422 const Register t2_owner_addr = t2;
423
424 // Compute owner address.
425 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
426
427 // Set owner to null.
428 // Release to satisfy the JMM
429 stlr(zr, t2_owner_addr);
430 // We need a full fence after clearing owner to avoid stranding.
431 // StoreLoad achieves this.
432 membar(StoreLoad);
433
434 // Check if the entry_list is empty.
435 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
436 cmp(rscratch1, zr);
437 br(Assembler::EQ, unlocked); // If so we are done.
438
439 // Check if there is a successor.
440 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
441 cmp(rscratch1, zr);
442 br(Assembler::NE, unlocked); // If so we are done.
443
444 // Save the monitor pointer in the current thread, so we can try to
445 // reacquire the lock in SharedRuntime::monitor_exit_helper().
446 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
447
448 cmp(zr, rthread); // Set Flag to NE => slow path
449 b(slow_path);
450 }
451
452 bind(unlocked);
453 cmp(zr, zr); // Set Flags to EQ => fast path
454
455 #ifdef ASSERT
456 // Check that unlocked label is reached with Flags == EQ.
457 Label flag_correct;
458 br(Assembler::EQ, flag_correct);
459 stop("Fast Unlock Flag != EQ");
460 #endif
461
462 bind(slow_path);
463 #ifdef ASSERT
464 // Check that slow_path label is reached with Flags == NE.
465 br(Assembler::NE, flag_correct);
466 stop("Fast Unlock Flag != NE");
467 bind(flag_correct);
468 #endif
469 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
470 }
471
472 // Search for str1 in str2 and return index or -1
473 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
474 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
475 Register cnt2, Register cnt1,
476 Register tmp1, Register tmp2,
477 Register tmp3, Register tmp4,
478 Register tmp5, Register tmp6,
479 int icnt1, Register result, int ae) {
480 // NOTE: tmp5, tmp6 can be zr depending on specific method version
481 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
482
483 Register ch1 = rscratch1;
484 Register ch2 = rscratch2;
485 Register cnt1tmp = tmp1;
486 Register cnt2tmp = tmp2;
487 Register cnt1_neg = cnt1;
488 Register cnt2_neg = cnt2;
489 Register result_tmp = tmp4;
490
491 bool isL = ae == StrIntrinsicNode::LL;
492
493 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
494 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
495 int str1_chr_shift = str1_isL ? 0:1;
496 int str2_chr_shift = str2_isL ? 0:1;
497 int str1_chr_size = str1_isL ? 1:2;
498 int str2_chr_size = str2_isL ? 1:2;
499 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
500 (chr_insn)&MacroAssembler::ldrh;
501 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
502 (chr_insn)&MacroAssembler::ldrh;
503 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
504 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
505
506 // Note, inline_string_indexOf() generates checks:
507 // if (substr.count > string.count) return -1;
508 // if (substr.count == 0) return 0;
509
510 // We have two strings, a source string in str2, cnt2 and a pattern string
511 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
512
513 // For larger pattern and source we use a simplified Boyer Moore algorithm.
514 // With a small pattern and source we use linear scan.
515
516 if (icnt1 == -1) {
517 sub(result_tmp, cnt2, cnt1);
518 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
519 br(LT, LINEARSEARCH);
520 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
521 subs(zr, cnt1, 256);
522 lsr(tmp1, cnt2, 2);
523 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
524 br(GE, LINEARSTUB);
525 }
526
527 // The Boyer-Moore algorithm is based on the description here:-
528 //
529 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
530 //
531 // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
532 // and the 'Good Suffix' rule.
533 //
534 // These rules are essentially heuristics for how far we can shift the
535 // pattern along the search string.
536 //
537 // The implementation here uses the 'Bad Character' rule only because of the
538 // complexity of initialisation for the 'Good Suffix' rule.
539 //
540 // This is also known as the Boyer-Moore-Horspool algorithm:-
541 //
542 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
543 //
544 // This particular implementation has a few Java-specific optimizations.
545 //
546 // #define ASIZE 256
547 //
548 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
549 // int i, j;
550 // unsigned c;
551 // unsigned char bc[ASIZE];
552 //
553 // /* Preprocessing */
554 // for (i = 0; i < ASIZE; ++i)
555 // bc[i] = m;
556 // for (i = 0; i < m - 1; ) {
557 // c = x[i];
558 // ++i;
559 // // c < 256 for Latin1 string, so, no need for branch
560 // #ifdef PATTERN_STRING_IS_LATIN1
561 // bc[c] = m - i;
562 // #else
563 // if (c < ASIZE) bc[c] = m - i;
564 // #endif
565 // }
566 //
567 // /* Searching */
568 // j = 0;
569 // while (j <= n - m) {
570 //     c = y[j+m-1];
571 // if (x[m-1] == c)
572 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
573 // if (i < 0) return j;
574 // // c < 256 for Latin1 string, so, no need for branch
575 // #ifdef SOURCE_STRING_IS_LATIN1
576 // // LL case: (c< 256) always true. Remove branch
577 // j += bc[y[j+m-1]];
578 // #endif
579 // #ifndef PATTERN_STRING_IS_UTF
580 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
581 // if (c < ASIZE)
582 // j += bc[y[j+m-1]];
583 // else
584 // j += 1
585 // #endif
586 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
587 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
588 // if (c < ASIZE)
589 // j += bc[y[j+m-1]];
590 // else
591 // j += m
592 // #endif
593 // }
594 // }
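// Worked example of the bad-character table (assuming the pattern "abcab", m = 5):
// preprocessing sets bc['a'] = 1, bc['b'] = 3, bc['c'] = 2 and leaves every other
// entry at m = 5, so a mismatch against a character that does not occur in the
// pattern shifts the pattern by its full length.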
595
596 if (icnt1 == -1) {
597 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
598 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
599 Register cnt1end = tmp2;
600 Register str2end = cnt2;
601 Register skipch = tmp2;
602
603 // str1 length is >= 8, so we can read at least 1 register for cases when
604 // UTF->Latin1 conversion is not needed (8 chars for LL or 4 for UU) and half a register for the
605 // UL case. We'll re-read the last character in the inner pre-loop code to have a
606 // single outer pre-loop load
607 const int firstStep = isL ? 7 : 3;
608
609 const int ASIZE = 256;
610 const int STORED_BYTES = 32; // number of bytes stored per instruction
611 sub(sp, sp, ASIZE);
612 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
613 mov(ch1, sp);
614 BIND(BM_INIT_LOOP);
615 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
616 subs(tmp5, tmp5, 1);
617 br(GT, BM_INIT_LOOP);
618
619 sub(cnt1tmp, cnt1, 1);
620 mov(tmp5, str2);
621 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
622 sub(ch2, cnt1, 1);
623 mov(tmp3, str1);
624 BIND(BCLOOP);
625 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
626 if (!str1_isL) {
627 subs(zr, ch1, ASIZE);
628 br(HS, BCSKIP);
629 }
630 strb(ch2, Address(sp, ch1));
631 BIND(BCSKIP);
632 subs(ch2, ch2, 1);
633 br(GT, BCLOOP);
634
635 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
636 if (str1_isL == str2_isL) {
637 // load last 8 bytes (8LL/4UU symbols)
638 ldr(tmp6, Address(tmp6, -wordSize));
639 } else {
640 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
641 // convert Latin1 to UTF. We'll have to wait until the load completes, but
642 // it's still faster than per-character loads+checks
643 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
644 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
645 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
646 andr(tmp6, tmp6, 0xFF); // str1[N-4]
647 orr(ch2, ch1, ch2, LSL, 16);
648 orr(tmp6, tmp6, tmp3, LSL, 48);
649 orr(tmp6, tmp6, ch2, LSL, 16);
650 }
651 BIND(BMLOOPSTR2);
652 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
653 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
654 if (str1_isL == str2_isL) {
655 // re-init tmp3. It's free because it executes in parallel with the
656 // load above. The alternative is to initialize it before the loop, but that would
657 // affect performance on in-order systems with 2 or more ld/st pipelines
658 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
659 }
660 if (!isL) { // UU/UL case
661 lsl(ch2, cnt1tmp, 1); // offset in bytes
662 }
663 cmp(tmp3, skipch);
664 br(NE, BMSKIP);
665 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
666 mov(ch1, tmp6);
667 if (isL) {
668 b(BMLOOPSTR1_AFTER_LOAD);
669 } else {
670 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
671 b(BMLOOPSTR1_CMP);
672 }
673 BIND(BMLOOPSTR1);
674 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
675 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
676 BIND(BMLOOPSTR1_AFTER_LOAD);
677 subs(cnt1tmp, cnt1tmp, 1);
678 br(LT, BMLOOPSTR1_LASTCMP);
679 BIND(BMLOOPSTR1_CMP);
680 cmp(ch1, ch2);
681 br(EQ, BMLOOPSTR1);
682 BIND(BMSKIP);
683 if (!isL) {
684 // if we've encountered a UTF symbol while searching with a Latin1 pattern, then we can
685 // skip cnt1 symbols
686 if (str1_isL != str2_isL) {
687 mov(result_tmp, cnt1);
688 } else {
689 mov(result_tmp, 1);
690 }
691 subs(zr, skipch, ASIZE);
692 br(HS, BMADV);
693 }
694 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
695 BIND(BMADV);
696 sub(cnt1tmp, cnt1, 1);
697 add(str2, str2, result_tmp, LSL, str2_chr_shift);
698 cmp(str2, str2end);
699 br(LE, BMLOOPSTR2);
700 add(sp, sp, ASIZE);
701 b(NOMATCH);
702 BIND(BMLOOPSTR1_LASTCMP);
703 cmp(ch1, ch2);
704 br(NE, BMSKIP);
705 BIND(BMMATCH);
706 sub(result, str2, tmp5);
707 if (!str2_isL) lsr(result, result, 1);
708 add(sp, sp, ASIZE);
709 b(DONE);
710
711 BIND(LINEARSTUB);
712 cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
713 br(LT, LINEAR_MEDIUM);
714 mov(result, zr);
715 RuntimeAddress stub = nullptr;
716 if (isL) {
717 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
718 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
719 } else if (str1_isL) {
720 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
721 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
722 } else {
723 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
724 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
725 }
726 address call = trampoline_call(stub);
727 if (call == nullptr) {
728 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
729 ciEnv::current()->record_failure("CodeCache is full");
730 return;
731 }
732 b(DONE);
733 }
734
735 BIND(LINEARSEARCH);
736 {
737 Label DO1, DO2, DO3;
738
739 Register str2tmp = tmp2;
740 Register first = tmp3;
741
742 if (icnt1 == -1)
743 {
744 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
745
746 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
747 br(LT, DOSHORT);
748 BIND(LINEAR_MEDIUM);
749 (this->*str1_load_1chr)(first, Address(str1));
750 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
751 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
752 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
753 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
754
755 BIND(FIRST_LOOP);
756 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
757 cmp(first, ch2);
758 br(EQ, STR1_LOOP);
759 BIND(STR2_NEXT);
760 adds(cnt2_neg, cnt2_neg, str2_chr_size);
761 br(LE, FIRST_LOOP);
762 b(NOMATCH);
763
764 BIND(STR1_LOOP);
765 adds(cnt1tmp, cnt1_neg, str1_chr_size);
766 add(cnt2tmp, cnt2_neg, str2_chr_size);
767 br(GE, MATCH);
768
769 BIND(STR1_NEXT);
770 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
771 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
772 cmp(ch1, ch2);
773 br(NE, STR2_NEXT);
774 adds(cnt1tmp, cnt1tmp, str1_chr_size);
775 add(cnt2tmp, cnt2tmp, str2_chr_size);
776 br(LT, STR1_NEXT);
777 b(MATCH);
778
779 BIND(DOSHORT);
780 if (str1_isL == str2_isL) {
781 cmp(cnt1, (u1)2);
782 br(LT, DO1);
783 br(GT, DO3);
784 }
785 }
786
787 if (icnt1 == 4) {
788 Label CH1_LOOP;
789
790 (this->*load_4chr)(ch1, str1);
791 sub(result_tmp, cnt2, 4);
792 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
793 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
794
795 BIND(CH1_LOOP);
796 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
797 cmp(ch1, ch2);
798 br(EQ, MATCH);
799 adds(cnt2_neg, cnt2_neg, str2_chr_size);
800 br(LE, CH1_LOOP);
801 b(NOMATCH);
802 }
803
804 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
805 Label CH1_LOOP;
806
807 BIND(DO2);
808 (this->*load_2chr)(ch1, str1);
809 if (icnt1 == 2) {
810 sub(result_tmp, cnt2, 2);
811 }
812 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
813 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
814 BIND(CH1_LOOP);
815 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
816 cmp(ch1, ch2);
817 br(EQ, MATCH);
818 adds(cnt2_neg, cnt2_neg, str2_chr_size);
819 br(LE, CH1_LOOP);
820 b(NOMATCH);
821 }
822
823 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
824 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
825
826 BIND(DO3);
827 (this->*load_2chr)(first, str1);
828 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
829 if (icnt1 == 3) {
830 sub(result_tmp, cnt2, 3);
831 }
832 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
833 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
834 BIND(FIRST_LOOP);
835 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
836 cmpw(first, ch2);
837 br(EQ, STR1_LOOP);
838 BIND(STR2_NEXT);
839 adds(cnt2_neg, cnt2_neg, str2_chr_size);
840 br(LE, FIRST_LOOP);
841 b(NOMATCH);
842
843 BIND(STR1_LOOP);
844 add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
845 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
846 cmp(ch1, ch2);
847 br(NE, STR2_NEXT);
848 b(MATCH);
849 }
850
851 if (icnt1 == -1 || icnt1 == 1) {
852 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
853
854 BIND(DO1);
855 (this->*str1_load_1chr)(ch1, str1);
856 cmp(cnt2, (u1)8);
857 br(LT, DO1_SHORT);
858
859 sub(result_tmp, cnt2, 8/str2_chr_size);
860 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
861 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
862 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
863
864 if (str2_isL) {
865 orr(ch1, ch1, ch1, LSL, 8);
866 }
867 orr(ch1, ch1, ch1, LSL, 16);
868 orr(ch1, ch1, ch1, LSL, 32);
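// SWAR search for the broadcast first character: the eor below zeroes every
// lane (byte or halfword) of the loaded word that matches ch1. The classic
// zero-lane test (v - ones) & ~(v | mask), with 0x01/0x0001 in each lane of
// ones and 0x7f/0x7fff in each lane of mask, computed here via sub/orr/bics,
// is non-zero iff some lane of v is zero, i.e. iff the character was found.
// rev + clz + LSR 3 then convert the position of that lane into a byte offset.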
869 BIND(CH1_LOOP);
870 ldr(ch2, Address(str2, cnt2_neg));
871 eor(ch2, ch1, ch2);
872 sub(tmp1, ch2, tmp3);
873 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
874 bics(tmp1, tmp1, tmp2);
875 br(NE, HAS_ZERO);
876 adds(cnt2_neg, cnt2_neg, 8);
877 br(LT, CH1_LOOP);
878
879 cmp(cnt2_neg, (u1)8);
880 mov(cnt2_neg, 0);
881 br(LT, CH1_LOOP);
882 b(NOMATCH);
883
884 BIND(HAS_ZERO);
885 rev(tmp1, tmp1);
886 clz(tmp1, tmp1);
887 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
888 b(MATCH);
889
890 BIND(DO1_SHORT);
891 mov(result_tmp, cnt2);
892 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
893 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
894 BIND(DO1_LOOP);
895 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
896 cmpw(ch1, ch2);
897 br(EQ, MATCH);
898 adds(cnt2_neg, cnt2_neg, str2_chr_size);
899 br(LT, DO1_LOOP);
900 }
901 }
902 BIND(NOMATCH);
903 mov(result, -1);
904 b(DONE);
905 BIND(MATCH);
906 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
907 BIND(DONE);
908 }
909
910 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
911 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
912
913 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
914 Register ch, Register result,
915 Register tmp1, Register tmp2, Register tmp3)
916 {
917 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
918 Register cnt1_neg = cnt1;
919 Register ch1 = rscratch1;
920 Register result_tmp = rscratch2;
921
922 cbz(cnt1, NOMATCH);
923
924 cmp(cnt1, (u1)4);
925 br(LT, DO1_SHORT);
926
927 orr(ch, ch, ch, LSL, 16);
928 orr(ch, ch, ch, LSL, 32);
929
930 sub(cnt1, cnt1, 4);
931 mov(result_tmp, cnt1);
932 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
933 sub(cnt1_neg, zr, cnt1, LSL, 1);
934
935 mov(tmp3, 0x0001000100010001);
936
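// Same SWAR zero-lane test as in string_indexof above, with 16-bit lanes:
// the bics result is non-zero iff some halfword of (loaded word ^ broadcast ch) is zero.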
937 BIND(CH1_LOOP);
938 ldr(ch1, Address(str1, cnt1_neg));
939 eor(ch1, ch, ch1);
940 sub(tmp1, ch1, tmp3);
941 orr(tmp2, ch1, 0x7fff7fff7fff7fff);
942 bics(tmp1, tmp1, tmp2);
943 br(NE, HAS_ZERO);
944 adds(cnt1_neg, cnt1_neg, 8);
945 br(LT, CH1_LOOP);
946
947 cmp(cnt1_neg, (u1)8);
948 mov(cnt1_neg, 0);
949 br(LT, CH1_LOOP);
950 b(NOMATCH);
951
952 BIND(HAS_ZERO);
953 rev(tmp1, tmp1);
954 clz(tmp1, tmp1);
955 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
956 b(MATCH);
957
958 BIND(DO1_SHORT);
959 mov(result_tmp, cnt1);
960 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
961 sub(cnt1_neg, zr, cnt1, LSL, 1);
962 BIND(DO1_LOOP);
963 ldrh(ch1, Address(str1, cnt1_neg));
964 cmpw(ch, ch1);
965 br(EQ, MATCH);
966 adds(cnt1_neg, cnt1_neg, 2);
967 br(LT, DO1_LOOP);
968 BIND(NOMATCH);
969 mov(result, -1);
970 b(DONE);
971 BIND(MATCH);
972 add(result, result_tmp, cnt1_neg, ASR, 1);
973 BIND(DONE);
974 }
975
976 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
977 Register ch, Register result,
978 FloatRegister ztmp1,
979 FloatRegister ztmp2,
980 PRegister tmp_pg,
981 PRegister tmp_pdn, bool isL)
982 {
983 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
984 assert(tmp_pg->is_governing(),
985 "this register has to be a governing predicate register");
986
987 Label LOOP, MATCH, DONE, NOMATCH;
988 Register vec_len = rscratch1;
989 Register idx = rscratch2;
990
991 SIMD_RegVariant T = isL ? B : H;
992
993 cbz(cnt1, NOMATCH);
994
995 // Assign the particular char throughout the vector.
996 sve_dup(ztmp2, T, ch);
997 if (isL) {
998 sve_cntb(vec_len);
999 } else {
1000 sve_cnth(vec_len);
1001 }
1002 mov(idx, 0);
1003
1004 // Generate a predicate to control the reading of input string.
1005 sve_whilelt(tmp_pg, T, idx, cnt1);
1006
1007 BIND(LOOP);
1008 // Read a vector of 8- or 16-bit data depending on the string type. Note
1009 // that inactive elements indicated by the predicate register won't cause
1010 // a data read from memory to the destination vector.
1011 if (isL) {
1012 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1013 } else {
1014 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1015 }
1016 add(idx, idx, vec_len);
1017
1018 // Perform the comparison. An element of the destination predicate is set
1019 // to active if the particular char is matched.
1020 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1021
1022 // Branch if the particular char is found.
1023 br(NE, MATCH);
1024
1025 sve_whilelt(tmp_pg, T, idx, cnt1);
1026
1027 // Loop back if the particular char is not found.
1028 br(MI, LOOP);
1029
1030 BIND(NOMATCH);
1031 mov(result, -1);
1032 b(DONE);
1033
1034 BIND(MATCH);
1035 // Undo the index increment.
1036 sub(idx, idx, vec_len);
1037
1038 // Crop the vector to find its location.
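// brka keeps the lanes up to and including the first match, so incp adds
// (matched lane index + 1) to (chunk start index - 1), yielding the absolute
// index of the match.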
1039 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1040 add(result, idx, -1);
1041 sve_incp(result, T, tmp_pdn);
1042 BIND(DONE);
1043 }
1044
1045 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1046 Register ch, Register result,
1047 Register tmp1, Register tmp2, Register tmp3)
1048 {
1049 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1050 Register cnt1_neg = cnt1;
1051 Register ch1 = rscratch1;
1052 Register result_tmp = rscratch2;
1053
1054 cbz(cnt1, NOMATCH);
1055
1056 cmp(cnt1, (u1)8);
1057 br(LT, DO1_SHORT);
1058
1059 orr(ch, ch, ch, LSL, 8);
1060 orr(ch, ch, ch, LSL, 16);
1061 orr(ch, ch, ch, LSL, 32);
1062
1063 sub(cnt1, cnt1, 8);
1064 mov(result_tmp, cnt1);
1065 lea(str1, Address(str1, cnt1));
1066 sub(cnt1_neg, zr, cnt1);
1067
1068 mov(tmp3, 0x0101010101010101);
1069
1070 BIND(CH1_LOOP);
1071 ldr(ch1, Address(str1, cnt1_neg));
1072 eor(ch1, ch, ch1);
1073 sub(tmp1, ch1, tmp3);
1074 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1075 bics(tmp1, tmp1, tmp2);
1076 br(NE, HAS_ZERO);
1077 adds(cnt1_neg, cnt1_neg, 8);
1078 br(LT, CH1_LOOP);
1079
1080 cmp(cnt1_neg, (u1)8);
1081 mov(cnt1_neg, 0);
1082 br(LT, CH1_LOOP);
1083 b(NOMATCH);
1084
1085 BIND(HAS_ZERO);
1086 rev(tmp1, tmp1);
1087 clz(tmp1, tmp1);
1088 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1089 b(MATCH);
1090
1091 BIND(DO1_SHORT);
1092 mov(result_tmp, cnt1);
1093 lea(str1, Address(str1, cnt1));
1094 sub(cnt1_neg, zr, cnt1);
1095 BIND(DO1_LOOP);
1096 ldrb(ch1, Address(str1, cnt1_neg));
1097 cmp(ch, ch1);
1098 br(EQ, MATCH);
1099 adds(cnt1_neg, cnt1_neg, 1);
1100 br(LT, DO1_LOOP);
1101 BIND(NOMATCH);
1102 mov(result, -1);
1103 b(DONE);
1104 BIND(MATCH);
1105 add(result, result_tmp, cnt1_neg);
1106 BIND(DONE);
1107 }
1108
1109 // Compare strings.
1110 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1111 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1112 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1113 PRegister pgtmp1, PRegister pgtmp2, int ae) {
1114 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1115 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1116 SHORT_LOOP_START, TAIL_CHECK;
1117
1118 bool isLL = ae == StrIntrinsicNode::LL;
1119 bool isLU = ae == StrIntrinsicNode::LU;
1120 bool isUL = ae == StrIntrinsicNode::UL;
1121
1122 // The stub threshold for LL strings is: 72 (64 + 8) chars
1123 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1124 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1125 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1126
1127 bool str1_isL = isLL || isLU;
1128 bool str2_isL = isLL || isUL;
1129
1130 int str1_chr_shift = str1_isL ? 0 : 1;
1131 int str2_chr_shift = str2_isL ? 0 : 1;
1132 int str1_chr_size = str1_isL ? 1 : 2;
1133 int str2_chr_size = str2_isL ? 1 : 2;
1134 int minCharsInWord = isLL ? wordSize : wordSize/2;
1135
1136 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1137 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1138 (chr_insn)&MacroAssembler::ldrh;
1139 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1140 (chr_insn)&MacroAssembler::ldrh;
1141 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1142 (uxt_insn)&MacroAssembler::uxthw;
1143
1144 BLOCK_COMMENT("string_compare {");
1145
1146 // Bizarrely, the counts are passed in bytes, regardless of whether they
1147 // are L or U strings; however, the result is always in characters.
1148 if (!str1_isL) asrw(cnt1, cnt1, 1);
1149 if (!str2_isL) asrw(cnt2, cnt2, 1);
1150
1151 // Compute the minimum of the string lengths and save the difference.
1152 subsw(result, cnt1, cnt2);
1153 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1154
1155 // A very short string
1156 cmpw(cnt2, minCharsInWord);
1157 br(Assembler::LE, SHORT_STRING);
1158
1159 // Compare longwords
1160 // load first parts of strings and finish initialization while loading
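// For the mixed LU/UL cases the Latin1 side is widened on the fly: zip1 of the
// loaded bytes with a zero vector (vtmpZ) interleaves each byte with 0x00,
// turning 4 Latin1 chars into 4 UTF-16 chars that can be compared against a
// 64-bit load from the UTF-16 string.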
1161 {
1162 if (str1_isL == str2_isL) { // LL or UU
1163 ldr(tmp1, Address(str1));
1164 cmp(str1, str2);
1165 br(Assembler::EQ, DONE);
1166 ldr(tmp2, Address(str2));
1167 cmp(cnt2, stub_threshold);
1168 br(GE, STUB);
1169 subsw(cnt2, cnt2, minCharsInWord);
1170 br(EQ, TAIL_CHECK);
1171 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1172 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1173 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1174 } else if (isLU) {
1175 ldrs(vtmp, Address(str1));
1176 ldr(tmp2, Address(str2));
1177 cmp(cnt2, stub_threshold);
1178 br(GE, STUB);
1179 subw(cnt2, cnt2, 4);
1180 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1181 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1182 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1183 zip1(vtmp, T8B, vtmp, vtmpZ);
1184 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1185 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1186 add(cnt1, cnt1, 4);
1187 fmovd(tmp1, vtmp);
1188 } else { // UL case
1189 ldr(tmp1, Address(str1));
1190 ldrs(vtmp, Address(str2));
1191 cmp(cnt2, stub_threshold);
1192 br(GE, STUB);
1193 subw(cnt2, cnt2, 4);
1194 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1195 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1196 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1197 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1198 zip1(vtmp, T8B, vtmp, vtmpZ);
1199 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1200 add(cnt1, cnt1, 8);
1201 fmovd(tmp2, vtmp);
1202 }
1203 adds(cnt2, cnt2, isUL ? 4 : 8);
1204 br(GE, TAIL);
1205 eor(rscratch2, tmp1, tmp2);
1206 cbnz(rscratch2, DIFF);
1207 // main loop
1208 bind(NEXT_WORD);
1209 if (str1_isL == str2_isL) {
1210 ldr(tmp1, Address(str1, cnt2));
1211 ldr(tmp2, Address(str2, cnt2));
1212 adds(cnt2, cnt2, 8);
1213 } else if (isLU) {
1214 ldrs(vtmp, Address(str1, cnt1));
1215 ldr(tmp2, Address(str2, cnt2));
1216 add(cnt1, cnt1, 4);
1217 zip1(vtmp, T8B, vtmp, vtmpZ);
1218 fmovd(tmp1, vtmp);
1219 adds(cnt2, cnt2, 8);
1220 } else { // UL
1221 ldrs(vtmp, Address(str2, cnt2));
1222 ldr(tmp1, Address(str1, cnt1));
1223 zip1(vtmp, T8B, vtmp, vtmpZ);
1224 add(cnt1, cnt1, 8);
1225 fmovd(tmp2, vtmp);
1226 adds(cnt2, cnt2, 4);
1227 }
1228 br(GE, TAIL);
1229
1230 eor(rscratch2, tmp1, tmp2);
1231 cbz(rscratch2, NEXT_WORD);
1232 b(DIFF);
1233 bind(TAIL);
1234 eor(rscratch2, tmp1, tmp2);
1235 cbnz(rscratch2, DIFF);
1236 // Last longword. In the case where length == 4 we compare the
1237 // same longword twice, but that's still faster than another
1238 // conditional branch.
1239 if (str1_isL == str2_isL) {
1240 ldr(tmp1, Address(str1));
1241 ldr(tmp2, Address(str2));
1242 } else if (isLU) {
1243 ldrs(vtmp, Address(str1));
1244 ldr(tmp2, Address(str2));
1245 zip1(vtmp, T8B, vtmp, vtmpZ);
1246 fmovd(tmp1, vtmp);
1247 } else { // UL
1248 ldrs(vtmp, Address(str2));
1249 ldr(tmp1, Address(str1));
1250 zip1(vtmp, T8B, vtmp, vtmpZ);
1251 fmovd(tmp2, vtmp);
1252 }
1253 bind(TAIL_CHECK);
1254 eor(rscratch2, tmp1, tmp2);
1255 cbz(rscratch2, DONE);
1256
1257 // Find the first different characters in the longwords and
1258 // compute their difference.
1259 bind(DIFF);
1260 rev(rscratch2, rscratch2);
1261 clz(rscratch2, rscratch2);
1262 andr(rscratch2, rscratch2, isLL ? -8 : -16);
1263 lsrv(tmp1, tmp1, rscratch2);
1264 (this->*ext_chr)(tmp1, tmp1);
1265 lsrv(tmp2, tmp2, rscratch2);
1266 (this->*ext_chr)(tmp2, tmp2);
1267 subw(result, tmp1, tmp2);
1268 b(DONE);
1269 }
1270
1271 bind(STUB);
1272 RuntimeAddress stub = nullptr;
1273 switch(ae) {
1274 case StrIntrinsicNode::LL:
1275 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1276 break;
1277 case StrIntrinsicNode::UU:
1278 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1279 break;
1280 case StrIntrinsicNode::LU:
1281 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1282 break;
1283 case StrIntrinsicNode::UL:
1284 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1285 break;
1286 default:
1287 ShouldNotReachHere();
1288 }
1289 assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1290 address call = trampoline_call(stub);
1291 if (call == nullptr) {
1292 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1293 ciEnv::current()->record_failure("CodeCache is full");
1294 return;
1295 }
1296 b(DONE);
1297
1298 bind(SHORT_STRING);
1299 // Is the minimum length zero?
1300 cbz(cnt2, DONE);
1301 // Arrange the code to do most branches while loading, and to load the next
1302 // characters while comparing the previous ones.
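// The loop alternates between two register pairs: (tmp1, cnt1) and
// (tmp2, rscratch1); each pair holds one character from str1 and one from
// str2, so comparing one pair overlaps with loading the other.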
1303 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1304 subs(cnt2, cnt2, 1);
1305 br(EQ, SHORT_LAST_INIT);
1306 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1307 b(SHORT_LOOP_START);
1308 bind(SHORT_LOOP);
1309 subs(cnt2, cnt2, 1);
1310 br(EQ, SHORT_LAST);
1311 bind(SHORT_LOOP_START);
1312 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1313 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1314 cmp(tmp1, cnt1);
1315 br(NE, SHORT_LOOP_TAIL);
1316 subs(cnt2, cnt2, 1);
1317 br(EQ, SHORT_LAST2);
1318 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1319 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1320 cmp(tmp2, rscratch1);
1321 br(EQ, SHORT_LOOP);
1322 sub(result, tmp2, rscratch1);
1323 b(DONE);
1324 bind(SHORT_LOOP_TAIL);
1325 sub(result, tmp1, cnt1);
1326 b(DONE);
1327 bind(SHORT_LAST2);
1328 cmp(tmp2, rscratch1);
1329 br(EQ, DONE);
1330 sub(result, tmp2, rscratch1);
1331
1332 b(DONE);
1333 bind(SHORT_LAST_INIT);
1334 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1335 bind(SHORT_LAST);
1336 cmp(tmp1, cnt1);
1337 br(EQ, DONE);
1338 sub(result, tmp1, cnt1);
1339
1340 bind(DONE);
1341
1342 BLOCK_COMMENT("} string_compare");
1343 }
1344
1345 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1346 FloatRegister src2, Condition cond, bool isQ) {
1347 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1348 FloatRegister zn = src1, zm = src2;
1349 bool needs_negation = false;
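// Register-register ASIMD compares only exist in EQ/GE/GT/HI/HS forms, so
// LT/LE/LO/LS are implemented by swapping the operands and NE by negating the
// EQ result.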
1350 switch (cond) {
1351 case LT: cond = GT; zn = src2; zm = src1; break;
1352 case LE: cond = GE; zn = src2; zm = src1; break;
1353 case LO: cond = HI; zn = src2; zm = src1; break;
1354 case LS: cond = HS; zn = src2; zm = src1; break;
1355 case NE: cond = EQ; needs_negation = true; break;
1356 default:
1357 break;
1358 }
1359
1360 if (is_floating_point_type(bt)) {
1361 fcm(cond, dst, size, zn, zm);
1362 } else {
1363 cm(cond, dst, size, zn, zm);
1364 }
1365
1366 if (needs_negation) {
1367 notr(dst, isQ ? T16B : T8B, dst);
1368 }
1369 }
1370
1371 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1372 Condition cond, bool isQ) {
1373 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1374 if (bt == T_FLOAT || bt == T_DOUBLE) {
1375 if (cond == Assembler::NE) {
1376 fcm(Assembler::EQ, dst, size, src);
1377 notr(dst, isQ ? T16B : T8B, dst);
1378 } else {
1379 fcm(cond, dst, size, src);
1380 }
1381 } else {
1382 if (cond == Assembler::NE) {
1383 cm(Assembler::EQ, dst, size, src);
1384 notr(dst, isQ ? T16B : T8B, dst);
1385 } else {
1386 cm(cond, dst, size, src);
1387 }
1388 }
1389 }
1390
1391 // Compress the least significant bit of each byte to the rightmost and clear
1392 // the higher garbage bits.
1393 void C2_MacroAssembler::bytemask_compress(Register dst) {
1394 // Example input, dst = 0x01 00 00 00 01 01 00 01
1395 // The "??" bytes are garbage.
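// Each orr halves the spacing between the significant bits: the first gathers
// bit pairs, the second gathers nibbles, and the third collects all 8 bits
// into the low byte.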
1396 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1397 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1398 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1399 andr(dst, dst, 0xff); // dst = 0x8D
1400 }
1401
1402 // Pack the value of each mask element in "src" into a long value in "dst", at most
1403 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1404 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1405 // one bit in "dst".
1406 //
1407 // Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1408 // Expected: dst = 0x658D
1409 //
1410 // Clobbers: rscratch1
1411 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1412 FloatRegister vtmp, int lane_cnt) {
1413 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1414 assert_different_registers(dst, rscratch1);
1415 assert_different_registers(src, vtmp);
1416 assert(UseSVE > 0, "must be");
1417
1418 // Compress the lowest 8 bytes.
1419 fmovd(dst, src);
1420 bytemask_compress(dst);
1421 if (lane_cnt <= 8) return;
1422
1423 // Repeat on higher bytes and join the results.
1424 // Compress 8 bytes in each iteration.
1425 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1426 sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1427 bytemask_compress(rscratch1);
1428 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1429 }
1430 }
1431
1432 // The function is the same as "sve_vmask_tolong" above, but it uses SVE2's BEXT
1433 // instruction, which requires the FEAT_BITPERM feature.
1434 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1435 FloatRegister vtmp1, FloatRegister vtmp2,
1436 int lane_cnt) {
1437 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1438 assert_different_registers(src, vtmp1, vtmp2);
1439 assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1440
1441 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1442 // is to compress each significant bit of the byte in a cross-lane way. Due
1443 // to the lack of a cross-lane bit-compress instruction, we use BEXT
1444 // (bit-compress in each lane) with the biggest lane size (T = D) then
1445 // concatenate the results.
1446
1447 // The second source input of BEXT, initialized with 0x01 in each byte.
1448 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1449 sve_dup(vtmp2, B, 1);
1450
1451 // BEXT vtmp1.D, src.D, vtmp2.D
1452 // src = 0x0001010000010001 | 0x0100000001010001
1453 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1454 // ---------------------------------------
1455 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1456 sve_bext(vtmp1, D, src, vtmp2);
1457
1458 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1459 // result to dst.
1460 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1461 // dst = 0x658D
1462 if (lane_cnt <= 8) {
1463 // No need to concatenate.
1464 umov(dst, vtmp1, B, 0);
1465 } else if (lane_cnt <= 16) {
1466 ins(vtmp1, B, vtmp1, 1, 8);
1467 umov(dst, vtmp1, H, 0);
1468 } else {
1469 // As the lane count is 64 at most, the final expected value must be in
1470 // the lowest 64 bits after narrowing vtmp1 from D to B.
1471 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1472 umov(dst, vtmp1, D, 0);
1473 }
1474 }
1475
1476 // Unpack the mask, a long value in "src", into a vector register of boolean
1477 // represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
1478 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
1479 // most 64 lanes.
1480 //
1481 // Below example gives the expected dst vector register, with a valid src(0x658D)
1482 // on a 128-bit vector size machine.
1483 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1484 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1485 FloatRegister vtmp, int lane_cnt) {
1486 assert_different_registers(dst, vtmp);
1487 assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1488 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1489
1490 // Example: src = 0x658D, lane_cnt = 16
1491 // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1492
1493 // Put long value from general purpose register into the first lane of vector.
1494 // vtmp = 0x0000000000000000 | 0x000000000000658D
1495 sve_dup(vtmp, B, 0);
1496 mov(vtmp, D, 0, src);
1497
1498 // Transform the value in the first lane, which is currently a bit mask, into a
1499 // byte mask. This can be done with SVE2's BDEP instruction.
1500
1501 // The first source input of the BDEP instruction. Spread each mask byte into its own 8-byte lane.
1502 // vtmp = 0x0000000000000065 | 0x000000000000008D
1503 if (lane_cnt <= 8) {
1504 // Nothing to do, as only one byte exists.
1505 } else if (lane_cnt <= 16) {
1506 ins(vtmp, B, vtmp, 8, 1);
1507 } else {
1508 sve_vector_extend(vtmp, D, vtmp, B);
1509 }
1510
1511 // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1512 // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1513 sve_dup(dst, B, 1);
1514
1515 // BDEP dst.D, vtmp.D, dst.D
1516 // vtmp = 0x0000000000000065 | 0x000000000000008D
1517 // dst = 0x0101010101010101 | 0x0101010101010101
1518 // ---------------------------------------
1519 // dst = 0x0001010000010001 | 0x0100000001010001
1520 sve_bdep(dst, D, vtmp, dst);
1521 }
1522
1523 // Clobbers: rflags
1524 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1525 FloatRegister zn, FloatRegister zm, Condition cond) {
1526 assert(pg->is_governing(), "This register has to be a governing predicate register");
1527 FloatRegister z1 = zn, z2 = zm;
1528 switch (cond) {
1529 case LE: z1 = zm; z2 = zn; cond = GE; break;
1530 case LT: z1 = zm; z2 = zn; cond = GT; break;
1531 case LO: z1 = zm; z2 = zn; cond = HI; break;
1532 case LS: z1 = zm; z2 = zn; cond = HS; break;
1533 default:
1534 break;
1535 }
1536
1537 SIMD_RegVariant size = elemType_to_regVariant(bt);
1538 if (is_floating_point_type(bt)) {
1539 sve_fcm(cond, pd, size, pg, z1, z2);
1540 } else {
1541 assert(is_integral_type(bt), "unsupported element type");
1542 sve_cmp(cond, pd, size, pg, z1, z2);
1543 }
1544 }
1545
1546 // Get index of the last mask lane that is set
1547 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1548 SIMD_RegVariant size = elemType_to_regVariant(bt);
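// E.g. with 8 lanes and lanes {1, 2, 4} of src set: rev gives lanes {3, 5, 6},
// brkb marks the 3 lanes before the first true one (the number of lanes after
// the last true lane of src), and 7 - 3 = 4 is the index of the last set lane.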
1549 sve_rev(ptmp, size, src);
1550 sve_brkb(ptmp, ptrue, ptmp, false);
1551 sve_cntp(dst, size, ptrue, ptmp);
1552 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1553 subw(dst, rscratch1, dst);
1554 }
1555
1556 // Extend integer vector src to dst with the same lane count
1557 // but larger element size, e.g. 4B -> 4I
1558 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1559 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1560 if (src_bt == T_BYTE) {
1561 // 4B to 4S/4I, 8B to 8S
1562 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1563 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1564 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1565 if (dst_bt == T_INT) {
1566 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1567 }
1568 } else if (src_bt == T_SHORT) {
1569 // 2S to 2I/2L, 4S to 4I
1570 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1571 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1572 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1573 if (dst_bt == T_LONG) {
1574 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1575 }
1576 } else if (src_bt == T_INT) {
1577 // 2I to 2L
1578 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1579 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1580 } else {
1581 ShouldNotReachHere();
1582 }
1583 }
1584
1585 // Narrow integer vector src down to dst with the same lane count
1586 // but smaller element size, e.g. 4I -> 4B
1587 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1588 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1589 if (src_bt == T_SHORT) {
1590 // 4S/8S to 4B/8B
1591 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1592 assert(dst_bt == T_BYTE, "unsupported");
1593 xtn(dst, T8B, src, T8H);
1594 } else if (src_bt == T_INT) {
1595 // 2I to 2S, 4I to 4B/4S
1596 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1597 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1598 xtn(dst, T4H, src, T4S);
1599 if (dst_bt == T_BYTE) {
1600 xtn(dst, T8B, dst, T8H);
1601 }
1602 } else if (src_bt == T_LONG) {
1603 // 2L to 2S/2I
1604 assert(src_vlen_in_bytes == 16, "unsupported");
1605 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1606 xtn(dst, T2S, src, T2D);
1607 if (dst_bt == T_SHORT) {
1608 xtn(dst, T4H, dst, T4S);
1609 }
1610 } else {
1611 ShouldNotReachHere();
1612 }
1613 }
1614
1615 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1616 FloatRegister src, SIMD_RegVariant src_size,
1617 bool is_unsigned) {
1618 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1619
1620 if (src_size == B) {
1621 switch (dst_size) {
1622 case H:
1623 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1624 break;
1625 case S:
1626 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1627 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1628 break;
1629 case D:
1630 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1631 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1632 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1633 break;
1634 default:
1635 ShouldNotReachHere();
1636 }
1637 } else if (src_size == H) {
1638 if (dst_size == S) {
1639 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1640 } else { // D
1641 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1642 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1643 }
1644 } else if (src_size == S) {
1645 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1646 }
1647 }
1648
1649 // Vector narrow from src to dst with specified element sizes.
1650 // High part of dst vector will be filled with zero.
1651 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1652 FloatRegister src, SIMD_RegVariant src_size,
1653 FloatRegister tmp) {
1654 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1655 assert_different_registers(src, tmp);
1656 sve_dup(tmp, src_size, 0);
1657 if (src_size == D) {
1658 switch (dst_size) {
1659 case S:
1660 sve_uzp1(dst, S, src, tmp);
1661 break;
1662 case H:
1663 assert_different_registers(dst, tmp);
1664 sve_uzp1(dst, S, src, tmp);
1665 sve_uzp1(dst, H, dst, tmp);
1666 break;
1667 case B:
1668 assert_different_registers(dst, tmp);
1669 sve_uzp1(dst, S, src, tmp);
1670 sve_uzp1(dst, H, dst, tmp);
1671 sve_uzp1(dst, B, dst, tmp);
1672 break;
1673 default:
1674 ShouldNotReachHere();
1675 }
1676 } else if (src_size == S) {
1677 if (dst_size == H) {
1678 sve_uzp1(dst, H, src, tmp);
1679 } else { // B
1680 assert_different_registers(dst, tmp);
1681 sve_uzp1(dst, H, src, tmp);
1682 sve_uzp1(dst, B, dst, tmp);
1683 }
1684 } else if (src_size == H) {
1685 sve_uzp1(dst, B, src, tmp);
1686 }
1687 }
1688
1689 // Extend src predicate to dst predicate with the same lane count but larger
1690 // element size, e.g. 64Byte -> 512Long
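// For illustration, extending a predicate built for B elements to one for D elements
// (an 8x element size increase) takes the 8x branch below and emits three
// sve_punpklo steps, each one doubling the per-lane predicate width.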
1691 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1692 uint dst_element_length_in_bytes,
1693 uint src_element_length_in_bytes) {
1694 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1695 sve_punpklo(dst, src);
1696 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1697 sve_punpklo(dst, src);
1698 sve_punpklo(dst, dst);
1699 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1700 sve_punpklo(dst, src);
1701 sve_punpklo(dst, dst);
1702 sve_punpklo(dst, dst);
1703 } else {
1704 assert(false, "unsupported");
1705 ShouldNotReachHere();
1706 }
1707 }
1708
1709 // Narrow src predicate to dst predicate with the same lane count but
1710 // smaller element size, e.g. 512Long -> 64Byte
1711 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1712 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1713 // The insignificant bits in src predicate are expected to be zero.
1714   // To ensure the higher-order bits of the resultant narrowed vector are 0, an all-zero predicate is
1715   // passed as the second argument. An example narrowing operation with a given mask would be:
1716   // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1717 // Mask (for 2 Longs) : TF
1718 // Predicate register for the above mask (16 bits) : 00000001 00000000
1719 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1720 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1721 assert_different_registers(src, ptmp);
1722 assert_different_registers(dst, ptmp);
1723 sve_pfalse(ptmp);
1724 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1725 sve_uzp1(dst, B, src, ptmp);
1726 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1727 sve_uzp1(dst, H, src, ptmp);
1728 sve_uzp1(dst, B, dst, ptmp);
1729 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1730 sve_uzp1(dst, S, src, ptmp);
1731 sve_uzp1(dst, H, dst, ptmp);
1732 sve_uzp1(dst, B, dst, ptmp);
1733 } else {
1734 assert(false, "unsupported");
1735 ShouldNotReachHere();
1736 }
1737 }
1738
1739 // Vector reduction add for integral type with ASIMD instructions.
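// For illustration (following the T_INT case below), reducing a 16-byte vector of ints
// emits: addv(vtmp, T4S, vsrc); umov(dst, vtmp, S, 0); addw(dst, dst, isrc).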
1740 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1741 Register isrc, FloatRegister vsrc,
1742 unsigned vector_length_in_bytes,
1743 FloatRegister vtmp) {
1744 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1745 assert_different_registers(dst, isrc);
1746 bool isQ = vector_length_in_bytes == 16;
1747
1748 BLOCK_COMMENT("neon_reduce_add_integral {");
1749 switch(bt) {
1750 case T_BYTE:
1751 addv(vtmp, isQ ? T16B : T8B, vsrc);
1752 smov(dst, vtmp, B, 0);
1753 addw(dst, dst, isrc, ext::sxtb);
1754 break;
1755 case T_SHORT:
1756 addv(vtmp, isQ ? T8H : T4H, vsrc);
1757 smov(dst, vtmp, H, 0);
1758 addw(dst, dst, isrc, ext::sxth);
1759 break;
1760 case T_INT:
1761 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1762 umov(dst, vtmp, S, 0);
1763 addw(dst, dst, isrc);
1764 break;
1765 case T_LONG:
1766 assert(isQ, "unsupported");
1767 addpd(vtmp, vsrc);
1768 umov(dst, vtmp, D, 0);
1769 add(dst, dst, isrc);
1770 break;
1771 default:
1772 assert(false, "unsupported");
1773 ShouldNotReachHere();
1774 }
1775 BLOCK_COMMENT("} neon_reduce_add_integral");
1776 }
1777
1778 // Vector reduction multiply for integral type with ASIMD instructions.
1779 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1780 // Clobbers: rscratch1
1781 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1782 Register isrc, FloatRegister vsrc,
1783 unsigned vector_length_in_bytes,
1784 FloatRegister vtmp1, FloatRegister vtmp2) {
1785 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1786 bool isQ = vector_length_in_bytes == 16;
1787
1788 BLOCK_COMMENT("neon_reduce_mul_integral {");
1789 switch(bt) {
1790 case T_BYTE:
1791 if (isQ) {
1792         // Multiply the lower half and the higher half of the vector iteratively.
1793 // vtmp1 = vsrc[8:15]
1794 ins(vtmp1, D, vsrc, 0, 1);
1795 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1796 mulv(vtmp1, T8B, vtmp1, vsrc);
1797 // vtmp2 = vtmp1[4:7]
1798 ins(vtmp2, S, vtmp1, 0, 1);
1799 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1800 mulv(vtmp1, T8B, vtmp2, vtmp1);
1801 } else {
1802 ins(vtmp1, S, vsrc, 0, 1);
1803 mulv(vtmp1, T8B, vtmp1, vsrc);
1804 }
1805 // vtmp2 = vtmp1[2:3]
1806 ins(vtmp2, H, vtmp1, 0, 1);
1807 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1808 mulv(vtmp2, T8B, vtmp2, vtmp1);
1809 // dst = vtmp2[0] * isrc * vtmp2[1]
1810 umov(rscratch1, vtmp2, B, 0);
1811 mulw(dst, rscratch1, isrc);
1812 sxtb(dst, dst);
1813 umov(rscratch1, vtmp2, B, 1);
1814 mulw(dst, rscratch1, dst);
1815 sxtb(dst, dst);
1816 break;
1817 case T_SHORT:
1818 if (isQ) {
1819 ins(vtmp2, D, vsrc, 0, 1);
1820 mulv(vtmp2, T4H, vtmp2, vsrc);
1821 ins(vtmp1, S, vtmp2, 0, 1);
1822 mulv(vtmp1, T4H, vtmp1, vtmp2);
1823 } else {
1824 ins(vtmp1, S, vsrc, 0, 1);
1825 mulv(vtmp1, T4H, vtmp1, vsrc);
1826 }
1827 umov(rscratch1, vtmp1, H, 0);
1828 mulw(dst, rscratch1, isrc);
1829 sxth(dst, dst);
1830 umov(rscratch1, vtmp1, H, 1);
1831 mulw(dst, rscratch1, dst);
1832 sxth(dst, dst);
1833 break;
1834 case T_INT:
1835 if (isQ) {
1836 ins(vtmp1, D, vsrc, 0, 1);
1837 mulv(vtmp1, T2S, vtmp1, vsrc);
1838 } else {
1839 vtmp1 = vsrc;
1840 }
1841 umov(rscratch1, vtmp1, S, 0);
1842 mul(dst, rscratch1, isrc);
1843 umov(rscratch1, vtmp1, S, 1);
1844 mul(dst, rscratch1, dst);
1845 break;
1846 case T_LONG:
1847 umov(rscratch1, vsrc, D, 0);
1848 mul(dst, isrc, rscratch1);
1849 umov(rscratch1, vsrc, D, 1);
1850 mul(dst, dst, rscratch1);
1851 break;
1852 default:
1853 assert(false, "unsupported");
1854 ShouldNotReachHere();
1855 }
1856 BLOCK_COMMENT("} neon_reduce_mul_integral");
1857 }
1858
1859 // Vector reduction multiply for floating-point type with ASIMD instructions.
1860 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1861 FloatRegister fsrc, FloatRegister vsrc,
1862 unsigned vector_length_in_bytes,
1863 FloatRegister vtmp) {
1864 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1865 bool isQ = vector_length_in_bytes == 16;
1866
1867 BLOCK_COMMENT("neon_reduce_mul_fp {");
1868 switch(bt) {
1869 case T_FLOAT:
1870 fmuls(dst, fsrc, vsrc);
1871 ins(vtmp, S, vsrc, 0, 1);
1872 fmuls(dst, dst, vtmp);
1873 if (isQ) {
1874 ins(vtmp, S, vsrc, 0, 2);
1875 fmuls(dst, dst, vtmp);
1876 ins(vtmp, S, vsrc, 0, 3);
1877 fmuls(dst, dst, vtmp);
1878 }
1879 break;
1880 case T_DOUBLE:
1881 assert(isQ, "unsupported");
1882 fmuld(dst, fsrc, vsrc);
1883 ins(vtmp, D, vsrc, 0, 1);
1884 fmuld(dst, dst, vtmp);
1885 break;
1886 default:
1887 assert(false, "unsupported");
1888 ShouldNotReachHere();
1889 }
1890 BLOCK_COMMENT("} neon_reduce_mul_fp");
1891 }
1892
1893 // Helper to select logical instruction
1894 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1895 Register Rn, Register Rm,
1896 enum shift_kind kind, unsigned shift) {
1897 switch(opc) {
1898 case Op_AndReductionV:
1899 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1900 break;
1901 case Op_OrReductionV:
1902 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1903 break;
1904 case Op_XorReductionV:
1905 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1906 break;
1907 default:
1908 assert(false, "unsupported");
1909 ShouldNotReachHere();
1910 }
1911 }
1912
1913 // Vector reduction logical operations And, Or, Xor
1914 // Clobbers: rscratch1
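// For illustration (following the T_INT case below), an AndReductionV over a 16-byte
// vector of ints emits roughly:
//   umov(rscratch1, vsrc, D, 0); umov(dst, vsrc, D, 1);
//   andr(dst, dst, rscratch1);     // fold the two 64-bit halves
//   andr(dst, dst, dst, LSR, 32);  // fold the two remaining 32-bit lanes
//   andw(dst, isrc, dst);          // combine with the scalar input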
1915 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1916 Register isrc, FloatRegister vsrc,
1917 unsigned vector_length_in_bytes) {
1918 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1919 "unsupported");
1920 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1921 assert_different_registers(dst, isrc);
1922 bool isQ = vector_length_in_bytes == 16;
1923
1924 BLOCK_COMMENT("neon_reduce_logical {");
1925 umov(rscratch1, vsrc, isQ ? D : S, 0);
1926 umov(dst, vsrc, isQ ? D : S, 1);
1927 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1928 switch(bt) {
1929 case T_BYTE:
1930 if (isQ) {
1931 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1932 }
1933 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1934 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1935 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1936 sxtb(dst, dst);
1937 break;
1938 case T_SHORT:
1939 if (isQ) {
1940 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1941 }
1942 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1943 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1944 sxth(dst, dst);
1945 break;
1946 case T_INT:
1947 if (isQ) {
1948 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1949 }
1950 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1951 break;
1952 case T_LONG:
1953 assert(isQ, "unsupported");
1954 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1955 break;
1956 default:
1957 assert(false, "unsupported");
1958 ShouldNotReachHere();
1959 }
1960 BLOCK_COMMENT("} neon_reduce_logical");
1961 }
1962
1963 // Vector reduction min/max for integral type with ASIMD instructions.
1964 // Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1965 // Clobbers: rscratch1, rflags
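// For illustration (following the non-long path below), a MinReductionV over a 16-byte
// vector of ints emits: sminv(vtmp, T4S, vsrc); umov(dst, vtmp, S, 0);
// cmpw(dst, isrc); cselw(dst, dst, isrc, LT).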
1966 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1967 Register isrc, FloatRegister vsrc,
1968 unsigned vector_length_in_bytes,
1969 FloatRegister vtmp) {
1970 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1971 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1972 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1973 assert_different_registers(dst, isrc);
1974 bool isQ = vector_length_in_bytes == 16;
1975 bool is_min = opc == Op_MinReductionV;
1976
1977 BLOCK_COMMENT("neon_reduce_minmax_integral {");
1978 if (bt == T_LONG) {
1979 assert(vtmp == fnoreg, "should be");
1980 assert(isQ, "should be");
1981 umov(rscratch1, vsrc, D, 0);
1982 cmp(isrc, rscratch1);
1983 csel(dst, isrc, rscratch1, is_min ? LT : GT);
1984 umov(rscratch1, vsrc, D, 1);
1985 cmp(dst, rscratch1);
1986 csel(dst, dst, rscratch1, is_min ? LT : GT);
1987 } else {
1988 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1989 if (size == T2S) {
1990 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
1991 } else {
1992 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
1993 }
1994 if (bt == T_INT) {
1995 umov(dst, vtmp, S, 0);
1996 } else {
1997 smov(dst, vtmp, elemType_to_regVariant(bt), 0);
1998 }
1999 cmpw(dst, isrc);
2000 cselw(dst, dst, isrc, is_min ? LT : GT);
2001 }
2002 BLOCK_COMMENT("} neon_reduce_minmax_integral");
2003 }
2004
2005 // Vector reduction for integral type with SVE instruction.
2006 // Supported operations are Add, And, Or, Xor, Max, Min.
2007 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
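// For illustration (following the Op_AddReductionVI case below), an add reduction over
// byte elements emits: sve_uaddv(tmp, B, pg, src2); smov(dst, tmp, B, 0);
// addw(dst, src1, dst, ext::sxtb).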
2008 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2009 FloatRegister src2, PRegister pg, FloatRegister tmp) {
2010 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2011 assert(pg->is_governing(), "This register has to be a governing predicate register");
2012 assert_different_registers(src1, dst);
2013   // Registers "dst" and "tmp" will be clobbered; "src1" and "src2" are preserved.
2014 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2015 switch (opc) {
2016 case Op_AddReductionVI: {
2017 sve_uaddv(tmp, size, pg, src2);
2018 if (bt == T_BYTE) {
2019 smov(dst, tmp, size, 0);
2020 addw(dst, src1, dst, ext::sxtb);
2021 } else if (bt == T_SHORT) {
2022 smov(dst, tmp, size, 0);
2023 addw(dst, src1, dst, ext::sxth);
2024 } else {
2025 umov(dst, tmp, size, 0);
2026 addw(dst, dst, src1);
2027 }
2028 break;
2029 }
2030 case Op_AddReductionVL: {
2031 sve_uaddv(tmp, size, pg, src2);
2032 umov(dst, tmp, size, 0);
2033 add(dst, dst, src1);
2034 break;
2035 }
2036 case Op_AndReductionV: {
2037 sve_andv(tmp, size, pg, src2);
2038 if (bt == T_INT || bt == T_LONG) {
2039 umov(dst, tmp, size, 0);
2040 } else {
2041 smov(dst, tmp, size, 0);
2042 }
2043 if (bt == T_LONG) {
2044 andr(dst, dst, src1);
2045 } else {
2046 andw(dst, dst, src1);
2047 }
2048 break;
2049 }
2050 case Op_OrReductionV: {
2051 sve_orv(tmp, size, pg, src2);
2052 if (bt == T_INT || bt == T_LONG) {
2053 umov(dst, tmp, size, 0);
2054 } else {
2055 smov(dst, tmp, size, 0);
2056 }
2057 if (bt == T_LONG) {
2058 orr(dst, dst, src1);
2059 } else {
2060 orrw(dst, dst, src1);
2061 }
2062 break;
2063 }
2064 case Op_XorReductionV: {
2065 sve_eorv(tmp, size, pg, src2);
2066 if (bt == T_INT || bt == T_LONG) {
2067 umov(dst, tmp, size, 0);
2068 } else {
2069 smov(dst, tmp, size, 0);
2070 }
2071 if (bt == T_LONG) {
2072 eor(dst, dst, src1);
2073 } else {
2074 eorw(dst, dst, src1);
2075 }
2076 break;
2077 }
2078 case Op_MaxReductionV: {
2079 sve_smaxv(tmp, size, pg, src2);
2080 if (bt == T_INT || bt == T_LONG) {
2081 umov(dst, tmp, size, 0);
2082 } else {
2083 smov(dst, tmp, size, 0);
2084 }
2085 if (bt == T_LONG) {
2086 cmp(dst, src1);
2087 csel(dst, dst, src1, Assembler::GT);
2088 } else {
2089 cmpw(dst, src1);
2090 cselw(dst, dst, src1, Assembler::GT);
2091 }
2092 break;
2093 }
2094 case Op_MinReductionV: {
2095 sve_sminv(tmp, size, pg, src2);
2096 if (bt == T_INT || bt == T_LONG) {
2097 umov(dst, tmp, size, 0);
2098 } else {
2099 smov(dst, tmp, size, 0);
2100 }
2101 if (bt == T_LONG) {
2102 cmp(dst, src1);
2103 csel(dst, dst, src1, Assembler::LT);
2104 } else {
2105 cmpw(dst, src1);
2106 cselw(dst, dst, src1, Assembler::LT);
2107 }
2108 break;
2109 }
2110 default:
2111 assert(false, "unsupported");
2112 ShouldNotReachHere();
2113 }
2114
2115 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2116 if (bt == T_BYTE) {
2117 sxtb(dst, dst);
2118 } else if (bt == T_SHORT) {
2119 sxth(dst, dst);
2120 }
2121 }
2122 }
2123
2124 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2125 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2126 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
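// For illustration, with bt == T_BYTE on a 256-bit SVE machine (32 byte lanes),
// lane_cnt == 16 maps to sve_ptrue(dst, B, /* VL16 */), while a count such as 17
// matches neither the fixed nor the special patterns and falls back to "whileltw".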
2127 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2128 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2129 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2130
2131 // Set all elements to false if the input "lane_cnt" is zero.
2132 if (lane_cnt == 0) {
2133 sve_pfalse(dst);
2134 return;
2135 }
2136
2137 SIMD_RegVariant size = elemType_to_regVariant(bt);
2138 assert(size != Q, "invalid size");
2139
2140   // Set all elements to true if "lane_cnt" equals the max lane count.
2141 if (lane_cnt == max_vector_length) {
2142 sve_ptrue(dst, size, /* ALL */ 0b11111);
2143 return;
2144 }
2145
2146 // Fixed numbers for "ptrue".
2147 switch(lane_cnt) {
2148 case 1: /* VL1 */
2149 case 2: /* VL2 */
2150 case 3: /* VL3 */
2151 case 4: /* VL4 */
2152 case 5: /* VL5 */
2153 case 6: /* VL6 */
2154 case 7: /* VL7 */
2155 case 8: /* VL8 */
2156 sve_ptrue(dst, size, lane_cnt);
2157 return;
2158 case 16:
2159 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2160 return;
2161 case 32:
2162 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2163 return;
2164 case 64:
2165 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2166 return;
2167 case 128:
2168 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2169 return;
2170 case 256:
2171 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2172 return;
2173 default:
2174 break;
2175 }
2176
2177 // Special patterns for "ptrue".
2178 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2179 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2180 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2181 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2182 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2183 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2184 } else {
2185 // Encode to "whileltw" for the remaining cases.
2186 mov(rscratch1, lane_cnt);
2187 sve_whileltw(dst, size, zr, rscratch1);
2188 }
2189 }
2190
2191 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2192 // Any remaining elements of dst will be filled with zero.
2193 // Clobbers: rscratch1
2194 // Preserves: mask, vzr
2195 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2196 FloatRegister vzr, FloatRegister vtmp,
2197 PRegister pgtmp, unsigned vector_length_in_bytes) {
2198 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2199 // When called by sve_compress_byte, src and vtmp may be the same register.
2200 assert_different_registers(dst, src, vzr);
2201 assert_different_registers(dst, vtmp, vzr);
2202 assert_different_registers(mask, pgtmp);
2203 // high <-- low
2204 // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
2205 // mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
2206 // Expected result: dst = 00 00 00 hh ee dd bb aa
2207
2208 // Extend lowest half to type INT.
2209 // dst = 00dd 00cc 00bb 00aa
2210 sve_uunpklo(dst, S, src);
2211 // pgtmp = 0001 0000 0001 0001
2212 sve_punpklo(pgtmp, mask);
2213   // Pack the active elements (now of type INT) to the right,
2214   // and fill the remaining elements with zero.
2215 // dst = 0000 00dd 00bb 00aa
2216 sve_compact(dst, S, dst, pgtmp);
2217 // Narrow the result back to type SHORT.
2218 // dst = 00 00 00 00 00 dd bb aa
2219 sve_uzp1(dst, H, dst, vzr);
2220
2221 // Return if the vector length is no more than MaxVectorSize/2, since the
2222 // highest half is invalid.
2223 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2224 return;
2225 }
2226
2227 // Count the active elements of lowest half.
2228 // rscratch1 = 3
2229 sve_cntp(rscratch1, S, ptrue, pgtmp);
2230
2231 // Repeat to the highest half.
2232 // pgtmp = 0001 0000 0000 0001
2233 sve_punpkhi(pgtmp, mask);
2234 // vtmp = 00hh 00gg 00ff 00ee
2235 sve_uunpkhi(vtmp, S, src);
2236 // vtmp = 0000 0000 00hh 00ee
2237 sve_compact(vtmp, S, vtmp, pgtmp);
2238 // vtmp = 00 00 00 00 00 00 hh ee
2239 sve_uzp1(vtmp, H, vtmp, vzr);
2240
2241 // pgtmp = 00 00 00 00 00 01 01 01
2242 sve_whilelt(pgtmp, H, zr, rscratch1);
2243 // Compressed low: dst = 00 00 00 00 00 dd bb aa
2244 // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2245 // Combine the compressed low with the compressed high:
2246 // dst = 00 00 00 hh ee dd bb aa
2247 sve_splice(dst, H, pgtmp, vtmp);
2248 }
2249
2250 // Clobbers: rscratch1, rscratch2
2251 // Preserves: src, mask
2252 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2253 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2254 PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2255 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2256 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2257 assert_different_registers(mask, ptmp, pgtmp);
2258 // high <-- low
2259 // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
2260 // mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2261 // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2262 FloatRegister vzr = vtmp3;
2263 sve_dup(vzr, B, 0);
2264
2265 // Extend lowest half to type SHORT.
2266 // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
2267 sve_uunpklo(vtmp1, H, src);
2268 // ptmp = 00 01 00 00 00 01 00 01
2269 sve_punpklo(ptmp, mask);
2270   // Pack the active elements (now of type SHORT) to the right,
2271   // and fill the remaining elements with zero.
2272 // dst = 00 00 00 00 00 0g 0c 0a
2273 unsigned extended_size = vector_length_in_bytes << 1;
2274 sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2275 // Narrow the result back to type BYTE.
2276 // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2277 sve_uzp1(dst, B, dst, vzr);
2278
2279 // Return if the vector length is no more than MaxVectorSize/2, since the
2280 // highest half is invalid.
2281 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2282 return;
2283 }
2284 // Count the active elements of lowest half.
2285 // rscratch2 = 3
2286 sve_cntp(rscratch2, H, ptrue, ptmp);
2287
2288 // Repeat to the highest half.
2289 // ptmp = 00 01 00 00 00 00 00 01
2290 sve_punpkhi(ptmp, mask);
2291 // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
2292 sve_uunpkhi(vtmp2, H, src);
2293 // vtmp1 = 00 00 00 00 00 00 0p 0i
2294 sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2295 // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2296 sve_uzp1(vtmp1, B, vtmp1, vzr);
2297
2298 // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2299 sve_whilelt(ptmp, B, zr, rscratch2);
2300 // Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2301 // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2302 // Combine the compressed low with the compressed high:
2303 // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2304 sve_splice(dst, B, ptmp, vtmp1);
2305 }
2306
2307 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2308 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2309 SIMD_Arrangement size = isQ ? T16B : T8B;
2310 if (bt == T_BYTE) {
2311 rbit(dst, size, src);
2312 } else {
2313 neon_reverse_bytes(dst, src, bt, isQ);
2314 rbit(dst, size, dst);
2315 }
2316 }
2317
2318 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2319 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2320 SIMD_Arrangement size = isQ ? T16B : T8B;
2321 switch (bt) {
2322 case T_BYTE:
2323 if (dst != src) {
2324 orr(dst, size, src, src);
2325 }
2326 break;
2327 case T_SHORT:
2328 rev16(dst, size, src);
2329 break;
2330 case T_INT:
2331 rev32(dst, size, src);
2332 break;
2333 case T_LONG:
2334 rev64(dst, size, src);
2335 break;
2336 default:
2337 assert(false, "unsupported");
2338 ShouldNotReachHere();
2339 }
2340 }
2341
2342 // VectorRearrange implementation for short/int/float/long/double types with NEON
2343 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2344 // But since it supports byte tables only, we need to look up 2/4 bytes as a group.
2345 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2346 // and use bsl to implement the operation.
2347 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2348 FloatRegister shuffle, FloatRegister tmp,
2349 BasicType bt, bool isQ) {
2350 assert_different_registers(dst, src, shuffle, tmp);
2351 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2352 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2353
2354 // Here is an example that rearranges a NEON vector with 4 ints:
2355 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2356 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2357 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2358 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2359 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2360 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2361 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2362 // 4. Use Vm as index register, and use V1 as table register.
2363 // Then get V2 as the result by tbl NEON instructions.
2364 switch (bt) {
2365 case T_SHORT:
2366 mov(tmp, size1, 0x02);
2367 mulv(dst, size2, shuffle, tmp);
2368 mov(tmp, size2, 0x0100);
2369 addv(dst, size1, dst, tmp);
2370 tbl(dst, size1, src, 1, dst);
2371 break;
2372 case T_INT:
2373 case T_FLOAT:
2374 mov(tmp, size1, 0x04);
2375 mulv(dst, size2, shuffle, tmp);
2376 mov(tmp, size2, 0x03020100);
2377 addv(dst, size1, dst, tmp);
2378 tbl(dst, size1, src, 1, dst);
2379 break;
2380 case T_LONG:
2381 case T_DOUBLE:
2382 // Load the iota indices for Long type. The indices are ordered by
2383       // type B/S/I/L/F/D, and the offset between two types is 16; hence
2384 // the offset for L is 48.
2385 lea(rscratch1,
2386 ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2387 ldrq(tmp, rscratch1);
2388       // Check whether the input "shuffle" is the same as the iota indices.
2389       // Select "src" if true, otherwise the two swapped elements of "src".
2390 cm(EQ, dst, size2, shuffle, tmp);
2391 ext(tmp, size1, src, src, 8);
2392 bsl(dst, size1, src, tmp);
2393 break;
2394 default:
2395 assert(false, "unsupported element type");
2396 ShouldNotReachHere();
2397 }
2398 }
2399
2400 // Extract a scalar element from an sve vector at position 'idx'.
2401 // The input elements in src are expected to be of integral type.
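// For illustration, extracting lane 3 of a T_SHORT vector (3 * 16 bits < 128) uses a
// single smov, whereas lane 10 (10 * 16 bits >= 128) copies src to vtmp, shifts the
// selected element down with sve_ext, and then extracts it with smov.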
2402 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2403 int idx, FloatRegister vtmp) {
2404 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2405 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2406 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2407 if (bt == T_INT || bt == T_LONG) {
2408 umov(dst, src, size, idx);
2409 } else {
2410 smov(dst, src, size, idx);
2411 }
2412 } else {
2413 sve_orr(vtmp, src, src);
2414 sve_ext(vtmp, vtmp, idx << size);
2415 if (bt == T_INT || bt == T_LONG) {
2416 umov(dst, vtmp, size, 0);
2417 } else {
2418 smov(dst, vtmp, size, 0);
2419 }
2420 }
2421 }
2422
2423 // java.lang.Math::round intrinsics
2424
2425 // Clobbers: rscratch1, rflags
2426 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2427 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2428 assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2429 switch (T) {
2430 case T2S:
2431 case T4S:
2432 fmovs(tmp1, T, 0.5f);
2433 mov(rscratch1, jint_cast(0x1.0p23f));
2434 break;
2435 case T2D:
2436 fmovd(tmp1, T, 0.5);
2437 mov(rscratch1, julong_cast(0x1.0p52));
2438 break;
2439 default:
2440 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2441 }
2442 fadd(tmp1, T, tmp1, src);
2443 fcvtms(tmp1, T, tmp1);
2444 // tmp1 = floor(src + 0.5, ties to even)
2445
2446 fcvtas(dst, T, src);
2447 // dst = round(src), ties to away
2448
2449 fneg(tmp3, T, src);
2450 dup(tmp2, T, rscratch1);
2451 cm(HS, tmp3, T, tmp3, tmp2);
2452 // tmp3 is now a set of flags
2453
2454 bif(dst, T16B, tmp1, tmp3);
2455 // result in dst
2456 }
2457
2458 // Clobbers: rscratch1, rflags
2459 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2460 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2461 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2462 assert_different_registers(tmp1, tmp2, src, dst);
2463
2464 switch (T) {
2465 case S:
2466 mov(rscratch1, jint_cast(0x1.0p23f));
2467 break;
2468 case D:
2469 mov(rscratch1, julong_cast(0x1.0p52));
2470 break;
2471 default:
2472 assert(T == S || T == D, "invalid register variant");
2473 }
2474
2475 sve_frinta(dst, T, ptrue, src);
2476 // dst = round(src), ties to away
2477
2478 Label none;
2479
2480 sve_fneg(tmp1, T, ptrue, src);
2481 sve_dup(tmp2, T, rscratch1);
2482 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2483 br(EQ, none);
2484 {
2485 sve_cpy(tmp1, T, pgtmp, 0.5);
2486 sve_fadd(tmp1, T, pgtmp, src);
2487 sve_frintm(dst, T, pgtmp, tmp1);
2488 // dst = floor(src + 0.5, ties to even)
2489 }
2490 bind(none);
2491
2492 sve_fcvtzs(dst, T, ptrue, dst, T);
2493 // result in dst
2494 }
2495
2496 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2497 FloatRegister one, SIMD_Arrangement T) {
2498 assert_different_registers(dst, src, zero, one);
2499 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2500
2501 facgt(dst, T, src, zero);
2502 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2503 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2504 }
2505
2506 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2507 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2508 assert_different_registers(dst, src, zero, one, vtmp);
2509 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2510
2511 sve_orr(vtmp, src, src);
2512   sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2513 switch (T) {
2514 case S:
2515 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2516 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2517 // on the sign of the float value
2518 break;
2519 case D:
2520 sve_and(vtmp, T, min_jlong);
2521 sve_orr(vtmp, T, jlong_cast(1.0));
2522 break;
2523 default:
2524 assert(false, "unsupported");
2525 ShouldNotReachHere();
2526 }
2527 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2528 // Result in dst
2529 }
2530
2531 bool C2_MacroAssembler::in_scratch_emit_size() {
2532 if (ciEnv::current()->task() != nullptr) {
2533 PhaseOutput* phase_output = Compile::current()->output();
2534 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2535 return true;
2536 }
2537 }
2538 return MacroAssembler::in_scratch_emit_size();
2539 }
2540
2541 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2542 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2543 }
2544
2545 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2546 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2547 if (t == TypeInt::INT) {
2548 return;
2549 }
2550 BLOCK_COMMENT("verify_int_in_range {");
2551 Label L_success, L_failure;
2552
2553 jint lo = t->_lo;
2554 jint hi = t->_hi;
2555
2556 if (lo != min_jint && hi != max_jint) {
2557 subsw(rtmp, rval, lo);
2558 br(Assembler::LT, L_failure);
2559 subsw(rtmp, rval, hi);
2560 br(Assembler::LE, L_success);
2561 } else if (lo != min_jint) {
2562 subsw(rtmp, rval, lo);
2563 br(Assembler::GE, L_success);
2564 } else if (hi != max_jint) {
2565 subsw(rtmp, rval, hi);
2566 br(Assembler::LE, L_success);
2567 } else {
2568 ShouldNotReachHere();
2569 }
2570
2571 bind(L_failure);
2572 movw(c_rarg0, idx);
2573 mov(c_rarg1, rval);
2574 movw(c_rarg2, lo);
2575 movw(c_rarg3, hi);
2576 reconstruct_frame_pointer(rtmp);
2577 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2578 hlt(0);
2579
2580 bind(L_success);
2581 BLOCK_COMMENT("} verify_int_in_range");
2582 }
2583
2584 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2585 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2586 }
2587
2588 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2589 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2590 if (t == TypeLong::LONG) {
2591 return;
2592 }
2593 BLOCK_COMMENT("verify_long_in_range {");
2594 Label L_success, L_failure;
2595
2596 jlong lo = t->_lo;
2597 jlong hi = t->_hi;
2598
2599 if (lo != min_jlong && hi != max_jlong) {
2600 subs(rtmp, rval, lo);
2601 br(Assembler::LT, L_failure);
2602 subs(rtmp, rval, hi);
2603 br(Assembler::LE, L_success);
2604 } else if (lo != min_jlong) {
2605 subs(rtmp, rval, lo);
2606 br(Assembler::GE, L_success);
2607 } else if (hi != max_jlong) {
2608 subs(rtmp, rval, hi);
2609 br(Assembler::LE, L_success);
2610 } else {
2611 ShouldNotReachHere();
2612 }
2613
2614 bind(L_failure);
2615 movw(c_rarg0, idx);
2616 mov(c_rarg1, rval);
2617 mov(c_rarg2, lo);
2618 mov(c_rarg3, hi);
2619 reconstruct_frame_pointer(rtmp);
2620 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2621 hlt(0);
2622
2623 bind(L_success);
2624 BLOCK_COMMENT("} verify_long_in_range");
2625 }
2626
2627 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2628 const int framesize = Compile::current()->output()->frame_size_in_bytes();
2629 if (PreserveFramePointer) {
2630 // frame pointer is valid
2631 #ifdef ASSERT
2632 // Verify frame pointer value in rfp.
2633 add(rtmp, sp, framesize - 2 * wordSize);
2634 Label L_success;
2635 cmp(rfp, rtmp);
2636 br(Assembler::EQ, L_success);
2637 stop("frame pointer mismatch");
2638 bind(L_success);
2639 #endif // ASSERT
2640 } else {
2641 add(rfp, sp, framesize - 2 * wordSize);
2642 }
2643 }
2644
2645 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2646 // using Neon instructions, and places each selected element in the destination vector element
2647 // corresponding to the index vector element. Each index in the index register must be in the range
2648 // [0, 2 * NUM_ELEM), where NUM_ELEM is the number of BasicType elements per vector.
2649 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2650 // Otherwise, selects src2[idx - NUM_ELEM]
2651 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2652 FloatRegister src2, FloatRegister index,
2653 FloatRegister tmp, unsigned vector_length_in_bytes) {
2654 assert_different_registers(dst, src1, src2, tmp);
2655 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2656
2657 if (vector_length_in_bytes == 16) {
2658 assert(UseSVE <= 1, "sve must be <= 1");
2659 assert(src1->successor() == src2, "Source registers must be ordered");
2660     // If the vector length is 16B, then use the Neon "tbl" instruction with a two-vector table.
2661 tbl(dst, size, src1, 2, index);
2662 } else { // vector length == 8
2663 assert(UseSVE == 0, "must be Neon only");
2664 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2665 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2666     // instruction with a one-vector lookup.
2667 ins(tmp, D, src1, 0, 0);
2668 ins(tmp, D, src2, 1, 0);
2669 tbl(dst, size, tmp, 1, index);
2670 }
2671 }
2672
2673 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2674 // using SVE/SVE2 instructions, and places each selected element in the destination vector element
2675 // corresponding to the index vector element. Each index in the index register must be in the range
2676 // [0, 2 * NUM_ELEM), where NUM_ELEM is the number of BasicType elements per vector.
2677 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2678 // Otherwise, selects src2[idx - NUM_ELEM]
2679 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2680 FloatRegister src2, FloatRegister index,
2681 FloatRegister tmp, SIMD_RegVariant T,
2682 unsigned vector_length_in_bytes) {
2683 assert_different_registers(dst, src1, src2, index, tmp);
2684
2685 if (vector_length_in_bytes == 8) {
2686 // We need to fit both the source vectors (src1, src2) in a single vector register because the
2687 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2688 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2689     // instruction with a one-vector lookup.
2690 assert(UseSVE >= 1, "sve must be >= 1");
2691 ins(tmp, D, src1, 0, 0);
2692 ins(tmp, D, src2, 1, 0);
2693 sve_tbl(dst, T, tmp, index);
2694 } else { // UseSVE == 2 and vector_length_in_bytes > 8
2695     // If the vector length is > 8, then use the SVE2 "tbl" instruction with a two-vector table.
2696 // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2697 // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2698 // with the only exception of 8B vector length.
2699 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2700 assert(src1->successor() == src2, "Source registers must be ordered");
2701 sve_tbl(dst, T, src1, src2, index);
2702 }
2703 }
2704
2705 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2706 FloatRegister src2, FloatRegister index,
2707 FloatRegister tmp, BasicType bt,
2708 unsigned vector_length_in_bytes) {
2709
2710 assert_different_registers(dst, src1, src2, index, tmp);
2711
2712 // The cases that can reach this method are -
2713 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2714 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2715 //
2716 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2717 // and UseSVE = 2 with vector_length_in_bytes >= 8
2718 //
2719 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2720 // UseSVE = 1 with vector_length_in_bytes = 16
2721
2722 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2723 SIMD_RegVariant T = elemType_to_regVariant(bt);
2724 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2725 return;
2726 }
2727
2728 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2729 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2730 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2731
2732 bool isQ = vector_length_in_bytes == 16;
2733
2734 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2735 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2736
2737 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2738 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2739 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2740   // is the number of elements that can fit in a vector. For example, for T_SHORT with a 64-bit vector
2741   // length, the indices lie in the range [0, 8).
2742 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2743 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2744 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2745 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2746 // Add the multiplied result to the vector in tmp to obtain the byte level
2747 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2748 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2749
2750 if (bt == T_BYTE) {
2751 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2752 } else {
2753 int elem_size = (bt == T_SHORT) ? 2 : 4;
2754 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2755
2756 mov(tmp, size1, elem_size);
2757 mulv(dst, size2, index, tmp);
2758 mov(tmp, size2, tbl_offset);
2759 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2760 // to select a set of 2B/4B
2761 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2762 }
2763 }
2764
2765 // Vector expand implementation. Elements from the src vector are expanded into
2766 // the dst vector under the control of the vector mask.
2767 // Since there are no native instructions directly corresponding to expand before
2768 // SVE2p2, the following implementations mainly leverage the TBL instruction to
2769 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2770 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2771 // for NEON and SVE, but with different instructions where appropriate.
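// As a small worked example of the prefix sum step (4 lanes, written high <== low as
// in the examples below): starting from the mask-derived values 1 1 0 1, the first
// shift-and-add pass yields 2 1 1 1 and the second yields 3 2 1 1, i.e. each lane ends
// up holding the number of active lanes at or below its position.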
2772
2773 // Vector expand implementation for NEON.
2774 //
2775 // An example of 128-bit Byte vector:
2776 // Data direction: high <== low
2777 // Input:
2778 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2779 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2780 // Expected result:
2781 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2782 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2783 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2784 int vector_length_in_bytes) {
2785 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2786 assert_different_registers(dst, src, mask, tmp1, tmp2);
2787 // Since the TBL instruction only supports byte table, we need to
2788 // compute indices in byte type for all types.
2789 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2790 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2791 dup(tmp1, size, zr);
2792 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
2793 negr(dst, size, mask);
2794 // Calculate vector index for TBL with prefix sum algorithm.
2795 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
2796 for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2797 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2798 addv(dst, size, tmp2, dst);
2799 }
2800 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2801 orr(tmp2, size, mask, mask);
2802 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2803 bsl(tmp2, size, dst, tmp1);
2804 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2805 movi(tmp1, size, 1);
2806 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
2807 subv(dst, size, tmp2, tmp1);
2808 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2809 tbl(dst, size, src, 1, dst);
2810 }
2811
2812 // Vector expand implementation for SVE.
2813 //
2814 // An example of 128-bit Short vector:
2815 // Data direction: high <== low
2816 // Input:
2817 // src = gf ed cb a9 87 65 43 21
2818 // pg = 00 01 00 01 00 01 00 01
2819 // Expected result:
2820 // dst = 00 87 00 65 00 43 00 21
2821 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2822 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2823 int vector_length_in_bytes) {
2824 assert(UseSVE > 0, "expand implementation only for SVE");
2825 assert_different_registers(dst, src, tmp1, tmp2);
2826 SIMD_RegVariant size = elemType_to_regVariant(bt);
2827
2828 // tmp1 = 00 00 00 00 00 00 00 00
2829 sve_dup(tmp1, size, 0);
2830 sve_movprfx(tmp2, tmp1);
2831 // tmp2 = 00 01 00 01 00 01 00 01
2832 sve_cpy(tmp2, size, pg, 1, true);
2833 // Calculate vector index for TBL with prefix sum algorithm.
2834 // tmp2 = 04 04 03 03 02 02 01 01
2835 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2836 sve_movprfx(dst, tmp1);
2837 // The EXT instruction operates on the full-width sve register. The correct
2838 // index calculation method is:
2839 // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2840 // MaxVectorSize - i.
2841 sve_ext(dst, tmp2, MaxVectorSize - i);
2842 sve_add(tmp2, size, dst, tmp2);
2843 }
2844 // dst = 00 04 00 03 00 02 00 01
2845 sve_sel(dst, size, pg, tmp2, tmp1);
2846 // dst = -1 03 -1 02 -1 01 -1 00
2847 sve_sub(dst, size, 1);
2848 // dst = 00 87 00 65 00 43 00 21
2849 sve_tbl(dst, size, src, dst);
2850 }