1 /*
2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "opto/c2_MacroAssembler.hpp"
28 #include "opto/compile.hpp"
29 #include "opto/intrinsicnode.hpp"
30 #include "opto/matcher.hpp"
31 #include "opto/output.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/stubRoutines.hpp"
34 #include "utilities/globalDefinitions.hpp"
35 #include "utilities/powerOfTwo.hpp"
36
37 #ifdef PRODUCT
38 #define BLOCK_COMMENT(str) /* nothing */
39 #define STOP(error) stop(error)
40 #else
41 #define BLOCK_COMMENT(str) block_comment(str)
42 #define STOP(error) block_comment(error); stop(error)
43 #endif
44
45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
46
47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
48
49 void C2_MacroAssembler::entry_barrier() {
50 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
51 // Dummy labels for just measuring the code size
52 Label dummy_slow_path;
53 Label dummy_continuation;
54 Label dummy_guard;
55 Label* slow_path = &dummy_slow_path;
56 Label* continuation = &dummy_continuation;
57 Label* guard = &dummy_guard;
58 if (!Compile::current()->output()->in_scratch_emit_size()) {
59 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
60 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
61 Compile::current()->output()->add_stub(stub);
62 slow_path = &stub->entry();
63 continuation = &stub->continuation();
64 guard = &stub->guard();
65 }
66 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
67 bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
68 }
69
70 // jdk.internal.util.ArraysSupport.vectorizedHashCode
71 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
72 FloatRegister vdata0, FloatRegister vdata1,
73 FloatRegister vdata2, FloatRegister vdata3,
74 FloatRegister vmul0, FloatRegister vmul1,
75 FloatRegister vmul2, FloatRegister vmul3,
76 FloatRegister vpow, FloatRegister vpowm,
77 BasicType eltype) {
78 ARRAYS_HASHCODE_REGISTERS;
79
80 Register tmp1 = rscratch1, tmp2 = rscratch2;
81
82 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
83
  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
87 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
88 : eltype == T_CHAR || eltype == T_SHORT ? 8
89 : eltype == T_INT ? 4
90 : 0;
91 guarantee(vf, "unsupported eltype");
92
93 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
94 const size_t unroll_factor = 4;
95
96 switch (eltype) {
97 case T_BOOLEAN:
98 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
99 break;
100 case T_CHAR:
101 BLOCK_COMMENT("arrays_hashcode(char) {");
102 break;
103 case T_BYTE:
104 BLOCK_COMMENT("arrays_hashcode(byte) {");
105 break;
106 case T_SHORT:
107 BLOCK_COMMENT("arrays_hashcode(short) {");
108 break;
109 case T_INT:
110 BLOCK_COMMENT("arrays_hashcode(int) {");
111 break;
112 default:
113 ShouldNotReachHere();
114 }
115
116 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
117 // implemented by the stub executes just once. Call the stub only if at least two iterations will
118 // be executed.
119 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
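  // E.g. for T_INT arrays vf == 4, so the stub is only taken when cnt >= 8,
  // i.e. when the Neon loop inside the stub runs at least twice.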
120 cmpw(cnt, large_threshold);
121 br(Assembler::HS, LARGE);
122
123 bind(TAIL);
124
125 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
126 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
127 // Iteration eats up the remainder, uf elements at a time.
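  // Worked example (unroll_factor == 4, no Cortex-A53 nops): if cnt % 4 == 3, tmp1 ends up
  // 3 * 8 bytes before BR_BASE, so exactly 3 load + madd pairs execute before falling into
  // the subsw/br at BR_BASE.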
128 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
129 andr(tmp2, cnt, unroll_factor - 1);
130 adr(tmp1, BR_BASE);
  // On Cortex-A53 the shift is 4 because each unrolled iteration below is 4 instructions (2 nops are added).
132 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
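  // tmp2 = 31, the polynomial hash multiplier; each unrolled step below computes
  // result = 31 * result + ary[i].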
133 movw(tmp2, 0x1f);
134 br(tmp1);
135
136 bind(LOOP);
137 for (size_t i = 0; i < unroll_factor; ++i) {
138 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
139 maddw(result, result, tmp2, tmp1);
140 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
141 // Generate 2nd nop to have 4 instructions per iteration.
142 if (VM_Version::supports_a53mac()) {
143 nop();
144 }
145 }
146 bind(BR_BASE);
147 subsw(cnt, cnt, unroll_factor);
148 br(Assembler::HS, LOOP);
149
150 b(DONE);
151
152 bind(LARGE);
153
154 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
155 assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
156 address tpc = trampoline_call(stub);
157 if (tpc == nullptr) {
158 DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
159 postcond(pc() == badAddress);
160 return nullptr;
161 }
162
163 bind(DONE);
164
165 BLOCK_COMMENT("} // arrays_hashcode");
166
167 postcond(pc() != badAddress);
168 return pc();
169 }
170
171 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
172 Register t2, Register t3) {
173 assert_different_registers(obj, box, t1, t2, t3, rscratch2);
174
175 // Handle inflated monitor.
176 Label inflated;
  // Finish fast lock successfully. MUST be reached with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST be reached with flag == NE
  Label slow_path;
181
182 if (UseObjectMonitorTable) {
183 // Clear cache in case fast locking succeeds or we need to take the slow-path.
184 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
185 }
186
187 if (DiagnoseSyncOnValueBasedClasses != 0) {
188 load_klass(t1, obj);
189 ldrb(t1, Address(t1, Klass::misc_flags_offset()));
190 tst(t1, KlassFlags::_misc_is_value_based_class);
191 br(Assembler::NE, slow_path);
192 }
193
194 const Register t1_mark = t1;
195 const Register t3_t = t3;
196
197 { // Fast locking
198
    // Push lock to the lock-stack and finish successfully. MUST be reached with flag == EQ
200 Label push;
201
202 const Register t2_top = t2;
203
204 // Check if lock-stack is full.
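    // t2_top is the byte offset of the next free lock-stack slot; once it reaches
    // end_offset() there is no room left, hence the GT compare against end_offset() - 1.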
205 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
206 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
207 br(Assembler::GT, slow_path);
208
209 // Check if recursive.
210 subw(t3_t, t2_top, oopSize);
211 ldr(t3_t, Address(rthread, t3_t));
212 cmp(obj, t3_t);
213 br(Assembler::EQ, push);
214
215 // Relaxed normal load to check for monitor. Optimization for monitor case.
216 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
217 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
218
219 // Not inflated
220 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
221
222 // Try to lock. Transition lock-bits 0b01 => 0b00
223 orr(t1_mark, t1_mark, markWord::unlocked_value);
224 eor(t3_t, t1_mark, markWord::unlocked_value);
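    // t1_mark is the expected header with the unlocked bit (0b01) set; t3_t is the
    // desired header with the lock bits cleared (0b00, fast-locked).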
225 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
226 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
227 br(Assembler::NE, slow_path);
228
229 bind(push);
230 // After successful lock, push object on lock-stack.
231 str(obj, Address(rthread, t2_top));
232 addw(t2_top, t2_top, oopSize);
233 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
234 b(locked);
235 }
236
237 { // Handle inflated monitor.
238 bind(inflated);
239
240 const Register t1_monitor = t1;
241
242 if (!UseObjectMonitorTable) {
243 assert(t1_monitor == t1_mark, "should be the same here");
244 } else {
245 Label monitor_found;
246
247 // Load cache address
248 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
249
250 const int num_unrolled = 2;
251 for (int i = 0; i < num_unrolled; i++) {
252 ldr(t1, Address(t3_t));
253 cmp(obj, t1);
254 br(Assembler::EQ, monitor_found);
255 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
256 }
257
258 Label loop;
259
260 // Search for obj in cache.
261 bind(loop);
262
263 // Check for match.
264 ldr(t1, Address(t3_t));
265 cmp(obj, t1);
266 br(Assembler::EQ, monitor_found);
267
268 // Search until null encountered, guaranteed _null_sentinel at end.
269 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
270 cbnz(t1, loop);
271 // Cache Miss, NE set from cmp above, cbnz does not set flags
272 b(slow_path);
273
274 bind(monitor_found);
275 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
276 }
277
278 const Register t2_owner_addr = t2;
279 const Register t3_owner = t3;
280 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
281 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
282 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
283
284 Label monitor_locked;
285
286 // Compute owner address.
287 lea(t2_owner_addr, owner_address);
288
289 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
290 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
291 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
292 /*release*/ false, /*weak*/ false, t3_owner);
293 br(Assembler::EQ, monitor_locked);
294
295 // Check if recursive.
296 cmp(t3_owner, rscratch2);
297 br(Assembler::NE, slow_path);
298
299 // Recursive.
300 increment(recursions_address, 1);
301
302 bind(monitor_locked);
303 if (UseObjectMonitorTable) {
304 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
305 }
306 }
307
308 bind(locked);
309
310 #ifdef ASSERT
311 // Check that locked label is reached with Flags == EQ.
312 Label flag_correct;
313 br(Assembler::EQ, flag_correct);
314 stop("Fast Lock Flag != EQ");
315 #endif
316
317 bind(slow_path);
318 #ifdef ASSERT
319 // Check that slow_path label is reached with Flags == NE.
320 br(Assembler::NE, flag_correct);
321 stop("Fast Lock Flag != NE");
322 bind(flag_correct);
323 #endif
324 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
325 }
326
327 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
328 Register t2, Register t3) {
329 assert_different_registers(obj, box, t1, t2, t3);
330
331 // Handle inflated monitor.
332 Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST be reached with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST be reached with flag == NE
  Label slow_path;
337
338 const Register t1_mark = t1;
339 const Register t2_top = t2;
340 const Register t3_t = t3;
341
342 { // Fast unlock
343
344 Label push_and_slow_path;
345
346 // Check if obj is top of lock-stack.
347 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
348 subw(t2_top, t2_top, oopSize);
349 ldr(t3_t, Address(rthread, t2_top));
350 cmp(obj, t3_t);
351 // Top of lock stack was not obj. Must be monitor.
352 br(Assembler::NE, inflated_load_mark);
353
354 // Pop lock-stack.
355 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
356 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
357
358 // Check if recursive.
359 subw(t3_t, t2_top, oopSize);
360 ldr(t3_t, Address(rthread, t3_t));
361 cmp(obj, t3_t);
362 br(Assembler::EQ, unlocked);
363
364 // Not recursive.
365 // Load Mark.
366 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
367
368 // Check header for monitor (0b10).
369 // Because we got here by popping (meaning we pushed in locked)
370 // there will be no monitor in the box. So we need to push back the obj
371 // so that the runtime can fix any potential anonymous owner.
372 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
373
374 // Try to unlock. Transition lock bits 0b00 => 0b01
375 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
376 orr(t3_t, t1_mark, markWord::unlocked_value);
377 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
378 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
379 br(Assembler::EQ, unlocked);
380
381 bind(push_and_slow_path);
    // Either the CAS above failed or, with UseObjectMonitorTable, a monitor was found.
    // Restore the lock-stack and handle the unlock in the runtime.
384 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
385 addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
387 b(slow_path);
388 }
389
390
391 { // Handle inflated monitor.
392 bind(inflated_load_mark);
393 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
394 #ifdef ASSERT
395 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
396 stop("Fast Unlock not monitor");
397 #endif
398
399 bind(inflated);
400
401 #ifdef ASSERT
402 Label check_done;
403 subw(t2_top, t2_top, oopSize);
404 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
405 br(Assembler::LT, check_done);
406 ldr(t3_t, Address(rthread, t2_top));
407 cmp(obj, t3_t);
408 br(Assembler::NE, inflated);
409 stop("Fast Unlock lock on stack");
410 bind(check_done);
411 #endif
412
413 const Register t1_monitor = t1;
414
415 if (!UseObjectMonitorTable) {
416 assert(t1_monitor == t1_mark, "should be the same here");
417
418 // Untag the monitor.
419 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
420 } else {
421 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
422 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
423 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
424 br(Assembler::LO, slow_path);
425 }
426
427 const Register t2_recursions = t2;
428 Label not_recursive;
429
430 // Check if recursive.
431 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
432 cbz(t2_recursions, not_recursive);
433
434 // Recursive unlock.
435 sub(t2_recursions, t2_recursions, 1u);
436 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
437 // Set flag == EQ
438 cmp(t2_recursions, t2_recursions);
439 b(unlocked);
440
441 bind(not_recursive);
442
443 const Register t2_owner_addr = t2;
444
445 // Compute owner address.
446 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
447
448 // Set owner to null.
449 // Release to satisfy the JMM
450 stlr(zr, t2_owner_addr);
451 // We need a full fence after clearing owner to avoid stranding.
452 // StoreLoad achieves this.
453 membar(StoreLoad);
454
455 // Check if the entry_list is empty.
456 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
457 cmp(rscratch1, zr);
458 br(Assembler::EQ, unlocked); // If so we are done.
459
460 // Check if there is a successor.
461 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
462 cmp(rscratch1, zr);
463 br(Assembler::NE, unlocked); // If so we are done.
464
465 // Save the monitor pointer in the current thread, so we can try to
466 // reacquire the lock in SharedRuntime::monitor_exit_helper().
467 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
468
469 cmp(zr, rthread); // Set Flag to NE => slow path
470 b(slow_path);
471 }
472
473 bind(unlocked);
474 cmp(zr, zr); // Set Flags to EQ => fast path
475
476 #ifdef ASSERT
477 // Check that unlocked label is reached with Flags == EQ.
478 Label flag_correct;
479 br(Assembler::EQ, flag_correct);
480 stop("Fast Unlock Flag != EQ");
481 #endif
482
483 bind(slow_path);
484 #ifdef ASSERT
485 // Check that slow_path label is reached with Flags == NE.
486 br(Assembler::NE, flag_correct);
487 stop("Fast Unlock Flag != NE");
488 bind(flag_correct);
489 #endif
490 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
491 }
492
493 // Search for str1 in str2 and return index or -1
494 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
495 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
496 Register cnt2, Register cnt1,
497 Register tmp1, Register tmp2,
498 Register tmp3, Register tmp4,
499 Register tmp5, Register tmp6,
500 int icnt1, Register result, int ae) {
501 // NOTE: tmp5, tmp6 can be zr depending on specific method version
502 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
503
504 Register ch1 = rscratch1;
505 Register ch2 = rscratch2;
506 Register cnt1tmp = tmp1;
507 Register cnt2tmp = tmp2;
508 Register cnt1_neg = cnt1;
509 Register cnt2_neg = cnt2;
510 Register result_tmp = tmp4;
511
512 bool isL = ae == StrIntrinsicNode::LL;
513
514 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
515 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
516 int str1_chr_shift = str1_isL ? 0:1;
517 int str2_chr_shift = str2_isL ? 0:1;
518 int str1_chr_size = str1_isL ? 1:2;
519 int str2_chr_size = str2_isL ? 1:2;
520 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
521 (chr_insn)&MacroAssembler::ldrh;
522 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
523 (chr_insn)&MacroAssembler::ldrh;
524 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
525 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
526
527 // Note, inline_string_indexOf() generates checks:
528 // if (substr.count > string.count) return -1;
529 // if (substr.count == 0) return 0;
530
531 // We have two strings, a source string in str2, cnt2 and a pattern string
532 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
533
534 // For larger pattern and source we use a simplified Boyer Moore algorithm.
535 // With a small pattern and source we use linear scan.
536
537 if (icnt1 == -1) {
538 sub(result_tmp, cnt2, cnt1);
539 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
540 br(LT, LINEARSEARCH);
541 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
542 subs(zr, cnt1, 256);
543 lsr(tmp1, cnt2, 2);
544 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
545 br(GE, LINEARSTUB);
546 }
547
  // The Boyer-Moore algorithm is based on the description here:
549 //
550 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
551 //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
553 // and the 'Good Suffix' rule.
554 //
555 // These rules are essentially heuristics for how far we can shift the
556 // pattern along the search string.
557 //
558 // The implementation here uses the 'Bad Character' rule only because of the
559 // complexity of initialisation for the 'Good Suffix' rule.
560 //
  // This is also known as the Boyer-Moore-Horspool algorithm:
562 //
563 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
564 //
  // This particular implementation has a few java-specific optimizations.
566 //
567 // #define ASIZE 256
568 //
569 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
570 // int i, j;
571 // unsigned c;
572 // unsigned char bc[ASIZE];
573 //
574 // /* Preprocessing */
575 // for (i = 0; i < ASIZE; ++i)
576 // bc[i] = m;
577 // for (i = 0; i < m - 1; ) {
578 // c = x[i];
579 // ++i;
580 // // c < 256 for Latin1 string, so, no need for branch
581 // #ifdef PATTERN_STRING_IS_LATIN1
582 // bc[c] = m - i;
583 // #else
584 // if (c < ASIZE) bc[c] = m - i;
585 // #endif
586 // }
587 //
588 // /* Searching */
589 // j = 0;
590 // while (j <= n - m) {
  //      c = y[j+m-1];
592 // if (x[m-1] == c)
593 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
594 // if (i < 0) return j;
595 // // c < 256 for Latin1 string, so, no need for branch
596 // #ifdef SOURCE_STRING_IS_LATIN1
597 // // LL case: (c< 256) always true. Remove branch
598 // j += bc[y[j+m-1]];
599 // #endif
600 // #ifndef PATTERN_STRING_IS_UTF
601 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
602 // if (c < ASIZE)
603 // j += bc[y[j+m-1]];
604 // else
605 // j += 1
606 // #endif
607 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
608 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
609 // if (c < ASIZE)
610 // j += bc[y[j+m-1]];
611 // else
612 // j += m
613 // #endif
614 // }
615 // }
616
617 if (icnt1 == -1) {
618 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
619 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
620 Register cnt1end = tmp2;
621 Register str2end = cnt2;
622 Register skipch = tmp2;
623
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 chars for LL or 4 for UU) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
628 const int firstStep = isL ? 7 : 3;
629
630 const int ASIZE = 256;
631 const int STORED_BYTES = 32; // amount of bytes stored per instruction
632 sub(sp, sp, ASIZE);
633 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
634 mov(ch1, sp);
635 BIND(BM_INIT_LOOP);
636 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
637 subs(tmp5, tmp5, 1);
638 br(GT, BM_INIT_LOOP);
639
640 sub(cnt1tmp, cnt1, 1);
641 mov(tmp5, str2);
642 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
643 sub(ch2, cnt1, 1);
644 mov(tmp3, str1);
645 BIND(BCLOOP);
646 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
647 if (!str1_isL) {
648 subs(zr, ch1, ASIZE);
649 br(HS, BCSKIP);
650 }
651 strb(ch2, Address(sp, ch1));
652 BIND(BCSKIP);
653 subs(ch2, ch2, 1);
654 br(GT, BCLOOP);
655
656 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
657 if (str1_isL == str2_isL) {
658 // load last 8 bytes (8LL/4UU symbols)
659 ldr(tmp6, Address(tmp6, -wordSize));
660 } else {
661 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
662 // convert Latin1 to UTF. We'll have to wait until load completed, but
663 // it's still faster than per-character loads+checks
664 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
665 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
666 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
667 andr(tmp6, tmp6, 0xFF); // str1[N-4]
668 orr(ch2, ch1, ch2, LSL, 16);
669 orr(tmp6, tmp6, tmp3, LSL, 48);
670 orr(tmp6, tmp6, ch2, LSL, 16);
671 }
672 BIND(BMLOOPSTR2);
673 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
674 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
675 if (str1_isL == str2_isL) {
676 // re-init tmp3. It's for free because it's executed in parallel with
677 // load above. Alternative is to initialize it before loop, but it'll
678 // affect performance on in-order systems with 2 or more ld/st pipelines
679 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
680 }
681 if (!isL) { // UU/UL case
682 lsl(ch2, cnt1tmp, 1); // offset in bytes
683 }
684 cmp(tmp3, skipch);
685 br(NE, BMSKIP);
686 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
687 mov(ch1, tmp6);
688 if (isL) {
689 b(BMLOOPSTR1_AFTER_LOAD);
690 } else {
691 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
692 b(BMLOOPSTR1_CMP);
693 }
694 BIND(BMLOOPSTR1);
695 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
696 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
697 BIND(BMLOOPSTR1_AFTER_LOAD);
698 subs(cnt1tmp, cnt1tmp, 1);
699 br(LT, BMLOOPSTR1_LASTCMP);
700 BIND(BMLOOPSTR1_CMP);
701 cmp(ch1, ch2);
702 br(EQ, BMLOOPSTR1);
703 BIND(BMSKIP);
704 if (!isL) {
      // If we've met a UTF symbol while searching a Latin1 pattern, then we can
      // skip cnt1 symbols.
707 if (str1_isL != str2_isL) {
708 mov(result_tmp, cnt1);
709 } else {
710 mov(result_tmp, 1);
711 }
712 subs(zr, skipch, ASIZE);
713 br(HS, BMADV);
714 }
715 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
716 BIND(BMADV);
717 sub(cnt1tmp, cnt1, 1);
718 add(str2, str2, result_tmp, LSL, str2_chr_shift);
719 cmp(str2, str2end);
720 br(LE, BMLOOPSTR2);
721 add(sp, sp, ASIZE);
722 b(NOMATCH);
723 BIND(BMLOOPSTR1_LASTCMP);
724 cmp(ch1, ch2);
725 br(NE, BMSKIP);
726 BIND(BMMATCH);
727 sub(result, str2, tmp5);
728 if (!str2_isL) lsr(result, result, 1);
729 add(sp, sp, ASIZE);
730 b(DONE);
731
732 BIND(LINEARSTUB);
733 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
734 br(LT, LINEAR_MEDIUM);
735 mov(result, zr);
736 RuntimeAddress stub = nullptr;
737 if (isL) {
738 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
739 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
740 } else if (str1_isL) {
741 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
742 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
743 } else {
744 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
745 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
746 }
747 address call = trampoline_call(stub);
748 if (call == nullptr) {
749 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
750 ciEnv::current()->record_failure("CodeCache is full");
751 return;
752 }
753 b(DONE);
754 }
755
756 BIND(LINEARSEARCH);
757 {
758 Label DO1, DO2, DO3;
759
760 Register str2tmp = tmp2;
761 Register first = tmp3;
762
763 if (icnt1 == -1)
764 {
765 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
766
767 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
768 br(LT, DOSHORT);
769 BIND(LINEAR_MEDIUM);
770 (this->*str1_load_1chr)(first, Address(str1));
771 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
772 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
773 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
774 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
775
776 BIND(FIRST_LOOP);
777 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
778 cmp(first, ch2);
779 br(EQ, STR1_LOOP);
780 BIND(STR2_NEXT);
781 adds(cnt2_neg, cnt2_neg, str2_chr_size);
782 br(LE, FIRST_LOOP);
783 b(NOMATCH);
784
785 BIND(STR1_LOOP);
786 adds(cnt1tmp, cnt1_neg, str1_chr_size);
787 add(cnt2tmp, cnt2_neg, str2_chr_size);
788 br(GE, MATCH);
789
790 BIND(STR1_NEXT);
791 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
792 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
793 cmp(ch1, ch2);
794 br(NE, STR2_NEXT);
795 adds(cnt1tmp, cnt1tmp, str1_chr_size);
796 add(cnt2tmp, cnt2tmp, str2_chr_size);
797 br(LT, STR1_NEXT);
798 b(MATCH);
799
800 BIND(DOSHORT);
801 if (str1_isL == str2_isL) {
802 cmp(cnt1, (u1)2);
803 br(LT, DO1);
804 br(GT, DO3);
805 }
806 }
807
808 if (icnt1 == 4) {
809 Label CH1_LOOP;
810
811 (this->*load_4chr)(ch1, str1);
812 sub(result_tmp, cnt2, 4);
813 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
814 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
815
816 BIND(CH1_LOOP);
817 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
818 cmp(ch1, ch2);
819 br(EQ, MATCH);
820 adds(cnt2_neg, cnt2_neg, str2_chr_size);
821 br(LE, CH1_LOOP);
822 b(NOMATCH);
823 }
824
825 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
826 Label CH1_LOOP;
827
828 BIND(DO2);
829 (this->*load_2chr)(ch1, str1);
830 if (icnt1 == 2) {
831 sub(result_tmp, cnt2, 2);
832 }
833 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
834 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
835 BIND(CH1_LOOP);
836 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
837 cmp(ch1, ch2);
838 br(EQ, MATCH);
839 adds(cnt2_neg, cnt2_neg, str2_chr_size);
840 br(LE, CH1_LOOP);
841 b(NOMATCH);
842 }
843
844 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
845 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
846
847 BIND(DO3);
848 (this->*load_2chr)(first, str1);
849 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
850 if (icnt1 == 3) {
851 sub(result_tmp, cnt2, 3);
852 }
853 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
854 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
855 BIND(FIRST_LOOP);
856 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
857 cmpw(first, ch2);
858 br(EQ, STR1_LOOP);
859 BIND(STR2_NEXT);
860 adds(cnt2_neg, cnt2_neg, str2_chr_size);
861 br(LE, FIRST_LOOP);
862 b(NOMATCH);
863
864 BIND(STR1_LOOP);
865 add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
866 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
867 cmp(ch1, ch2);
868 br(NE, STR2_NEXT);
869 b(MATCH);
870 }
871
872 if (icnt1 == -1 || icnt1 == 1) {
873 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
874
875 BIND(DO1);
876 (this->*str1_load_1chr)(ch1, str1);
877 cmp(cnt2, (u1)8);
878 br(LT, DO1_SHORT);
879
880 sub(result_tmp, cnt2, 8/str2_chr_size);
881 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
882 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
883 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
884
885 if (str2_isL) {
886 orr(ch1, ch1, ch1, LSL, 8);
887 }
888 orr(ch1, ch1, ch1, LSL, 16);
889 orr(ch1, ch1, ch1, LSL, 32);
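      // ch1 now holds the search character replicated into every 8-/16-bit lane.
      // The loop below uses the SWAR "zero lane" test: with x = loaded ^ ch1,
      // (x - 0x01..01) & ~(x | 0x7f..7f) is non-zero iff some lane of x is zero,
      // and its lowest set sign bit marks the first matching character; rev + clz
      // in HAS_ZERO turn that into a byte offset.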
890 BIND(CH1_LOOP);
891 ldr(ch2, Address(str2, cnt2_neg));
892 eor(ch2, ch1, ch2);
893 sub(tmp1, ch2, tmp3);
894 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
895 bics(tmp1, tmp1, tmp2);
896 br(NE, HAS_ZERO);
897 adds(cnt2_neg, cnt2_neg, 8);
898 br(LT, CH1_LOOP);
899
900 cmp(cnt2_neg, (u1)8);
901 mov(cnt2_neg, 0);
902 br(LT, CH1_LOOP);
903 b(NOMATCH);
904
905 BIND(HAS_ZERO);
906 rev(tmp1, tmp1);
907 clz(tmp1, tmp1);
908 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
909 b(MATCH);
910
911 BIND(DO1_SHORT);
912 mov(result_tmp, cnt2);
913 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
914 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
915 BIND(DO1_LOOP);
916 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
917 cmpw(ch1, ch2);
918 br(EQ, MATCH);
919 adds(cnt2_neg, cnt2_neg, str2_chr_size);
920 br(LT, DO1_LOOP);
921 }
922 }
923 BIND(NOMATCH);
924 mov(result, -1);
925 b(DONE);
926 BIND(MATCH);
927 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
928 BIND(DONE);
929 }
930
931 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
932 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
933
934 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
935 Register ch, Register result,
936 Register tmp1, Register tmp2, Register tmp3)
937 {
938 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
939 Register cnt1_neg = cnt1;
940 Register ch1 = rscratch1;
941 Register result_tmp = rscratch2;
942
943 cbz(cnt1, NOMATCH);
944
945 cmp(cnt1, (u1)4);
946 br(LT, DO1_SHORT);
947
948 orr(ch, ch, ch, LSL, 16);
949 orr(ch, ch, ch, LSL, 32);
950
951 sub(cnt1, cnt1, 4);
952 mov(result_tmp, cnt1);
953 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
954 sub(cnt1_neg, zr, cnt1, LSL, 1);
955
956 mov(tmp3, 0x0001000100010001);
957
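  // Same SWAR "zero lane" trick as in string_indexof above: tmp3 and the
  // 0x7fff... mask flag the 16-bit lane that equals ch.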
958 BIND(CH1_LOOP);
959 ldr(ch1, Address(str1, cnt1_neg));
960 eor(ch1, ch, ch1);
961 sub(tmp1, ch1, tmp3);
962 orr(tmp2, ch1, 0x7fff7fff7fff7fff);
963 bics(tmp1, tmp1, tmp2);
964 br(NE, HAS_ZERO);
965 adds(cnt1_neg, cnt1_neg, 8);
966 br(LT, CH1_LOOP);
967
968 cmp(cnt1_neg, (u1)8);
969 mov(cnt1_neg, 0);
970 br(LT, CH1_LOOP);
971 b(NOMATCH);
972
973 BIND(HAS_ZERO);
974 rev(tmp1, tmp1);
975 clz(tmp1, tmp1);
976 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
977 b(MATCH);
978
979 BIND(DO1_SHORT);
980 mov(result_tmp, cnt1);
981 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
982 sub(cnt1_neg, zr, cnt1, LSL, 1);
983 BIND(DO1_LOOP);
984 ldrh(ch1, Address(str1, cnt1_neg));
985 cmpw(ch, ch1);
986 br(EQ, MATCH);
987 adds(cnt1_neg, cnt1_neg, 2);
988 br(LT, DO1_LOOP);
989 BIND(NOMATCH);
990 mov(result, -1);
991 b(DONE);
992 BIND(MATCH);
993 add(result, result_tmp, cnt1_neg, ASR, 1);
994 BIND(DONE);
995 }
996
997 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
998 Register ch, Register result,
999 FloatRegister ztmp1,
1000 FloatRegister ztmp2,
1001 PRegister tmp_pg,
1002 PRegister tmp_pdn, bool isL)
1003 {
1004 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1005 assert(tmp_pg->is_governing(),
1006 "this register has to be a governing predicate register");
1007
1008 Label LOOP, MATCH, DONE, NOMATCH;
1009 Register vec_len = rscratch1;
1010 Register idx = rscratch2;
1011
  SIMD_RegVariant T = isL ? B : H;
1013
1014 cbz(cnt1, NOMATCH);
1015
1016 // Assign the particular char throughout the vector.
1017 sve_dup(ztmp2, T, ch);
1018 if (isL) {
1019 sve_cntb(vec_len);
1020 } else {
1021 sve_cnth(vec_len);
1022 }
1023 mov(idx, 0);
1024
1025 // Generate a predicate to control the reading of input string.
1026 sve_whilelt(tmp_pg, T, idx, cnt1);
1027
1028 BIND(LOOP);
1029 // Read a vector of 8- or 16-bit data depending on the string type. Note
1030 // that inactive elements indicated by the predicate register won't cause
1031 // a data read from memory to the destination vector.
1032 if (isL) {
1033 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1034 } else {
1035 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1036 }
1037 add(idx, idx, vec_len);
1038
1039 // Perform the comparison. An element of the destination predicate is set
1040 // to active if the particular char is matched.
1041 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1042
1043 // Branch if the particular char is found.
1044 br(NE, MATCH);
1045
1046 sve_whilelt(tmp_pg, T, idx, cnt1);
1047
1048 // Loop back if the particular char not found.
1049 br(MI, LOOP);
1050
1051 BIND(NOMATCH);
1052 mov(result, -1);
1053 b(DONE);
1054
1055 BIND(MATCH);
1056 // Undo the index increment.
1057 sub(idx, idx, vec_len);
1058
1059 // Crop the vector to find its location.
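  // BRKA keeps the predicate lanes up to and including the first match, so the INCP
  // below adds (offset of the match + 1) to (idx - 1), yielding the match index.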
1060 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1061 add(result, idx, -1);
1062 sve_incp(result, T, tmp_pdn);
1063 BIND(DONE);
1064 }
1065
1066 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1067 Register ch, Register result,
1068 Register tmp1, Register tmp2, Register tmp3)
1069 {
1070 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1071 Register cnt1_neg = cnt1;
1072 Register ch1 = rscratch1;
1073 Register result_tmp = rscratch2;
1074
1075 cbz(cnt1, NOMATCH);
1076
1077 cmp(cnt1, (u1)8);
1078 br(LT, DO1_SHORT);
1079
1080 orr(ch, ch, ch, LSL, 8);
1081 orr(ch, ch, ch, LSL, 16);
1082 orr(ch, ch, ch, LSL, 32);
1083
1084 sub(cnt1, cnt1, 8);
1085 mov(result_tmp, cnt1);
1086 lea(str1, Address(str1, cnt1));
1087 sub(cnt1_neg, zr, cnt1);
1088
1089 mov(tmp3, 0x0101010101010101);
1090
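  // Byte-wise variant of the SWAR matching trick used in string_indexof/string_indexof_char above.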
1091 BIND(CH1_LOOP);
1092 ldr(ch1, Address(str1, cnt1_neg));
1093 eor(ch1, ch, ch1);
1094 sub(tmp1, ch1, tmp3);
1095 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1096 bics(tmp1, tmp1, tmp2);
1097 br(NE, HAS_ZERO);
1098 adds(cnt1_neg, cnt1_neg, 8);
1099 br(LT, CH1_LOOP);
1100
1101 cmp(cnt1_neg, (u1)8);
1102 mov(cnt1_neg, 0);
1103 br(LT, CH1_LOOP);
1104 b(NOMATCH);
1105
1106 BIND(HAS_ZERO);
1107 rev(tmp1, tmp1);
1108 clz(tmp1, tmp1);
1109 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1110 b(MATCH);
1111
1112 BIND(DO1_SHORT);
1113 mov(result_tmp, cnt1);
1114 lea(str1, Address(str1, cnt1));
1115 sub(cnt1_neg, zr, cnt1);
1116 BIND(DO1_LOOP);
1117 ldrb(ch1, Address(str1, cnt1_neg));
1118 cmp(ch, ch1);
1119 br(EQ, MATCH);
1120 adds(cnt1_neg, cnt1_neg, 1);
1121 br(LT, DO1_LOOP);
1122 BIND(NOMATCH);
1123 mov(result, -1);
1124 b(DONE);
1125 BIND(MATCH);
1126 add(result, result_tmp, cnt1_neg);
1127 BIND(DONE);
1128 }
1129
1130 // Compare strings.
1131 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1132 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1133 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1134 PRegister pgtmp1, PRegister pgtmp2, int ae) {
1135 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1136 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1137 SHORT_LOOP_START, TAIL_CHECK;
1138
1139 bool isLL = ae == StrIntrinsicNode::LL;
1140 bool isLU = ae == StrIntrinsicNode::LU;
1141 bool isUL = ae == StrIntrinsicNode::UL;
1142
1143 // The stub threshold for LL strings is: 72 (64 + 8) chars
1144 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1145 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1146 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1147
1148 bool str1_isL = isLL || isLU;
1149 bool str2_isL = isLL || isUL;
1150
1151 int str1_chr_shift = str1_isL ? 0 : 1;
1152 int str2_chr_shift = str2_isL ? 0 : 1;
1153 int str1_chr_size = str1_isL ? 1 : 2;
1154 int str2_chr_size = str2_isL ? 1 : 2;
1155 int minCharsInWord = isLL ? wordSize : wordSize/2;
1156
1157 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1158 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1159 (chr_insn)&MacroAssembler::ldrh;
1160 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1161 (chr_insn)&MacroAssembler::ldrh;
1162 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1163 (uxt_insn)&MacroAssembler::uxthw;
1164
1165 BLOCK_COMMENT("string_compare {");
1166
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
1169 if (!str1_isL) asrw(cnt1, cnt1, 1);
1170 if (!str2_isL) asrw(cnt2, cnt2, 1);
1171
1172 // Compute the minimum of the string lengths and save the difference.
1173 subsw(result, cnt1, cnt2);
1174 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1175
1176 // A very short string
1177 cmpw(cnt2, minCharsInWord);
1178 br(Assembler::LE, SHORT_STRING);
1179
1180 // Compare longwords
1181 // load first parts of strings and finish initialization while loading
1182 {
1183 if (str1_isL == str2_isL) { // LL or UU
1184 ldr(tmp1, Address(str1));
1185 cmp(str1, str2);
1186 br(Assembler::EQ, DONE);
1187 ldr(tmp2, Address(str2));
1188 cmp(cnt2, stub_threshold);
1189 br(GE, STUB);
1190 subsw(cnt2, cnt2, minCharsInWord);
1191 br(EQ, TAIL_CHECK);
1192 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1193 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1194 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1195 } else if (isLU) {
1196 ldrs(vtmp, Address(str1));
1197 ldr(tmp2, Address(str2));
1198 cmp(cnt2, stub_threshold);
1199 br(GE, STUB);
1200 subw(cnt2, cnt2, 4);
1201 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1202 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1203 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1204 zip1(vtmp, T8B, vtmp, vtmpZ);
1205 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1206 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1207 add(cnt1, cnt1, 4);
1208 fmovd(tmp1, vtmp);
1209 } else { // UL case
1210 ldr(tmp1, Address(str1));
1211 ldrs(vtmp, Address(str2));
1212 cmp(cnt2, stub_threshold);
1213 br(GE, STUB);
1214 subw(cnt2, cnt2, 4);
1215 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1216 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1217 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1218 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1219 zip1(vtmp, T8B, vtmp, vtmpZ);
1220 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1221 add(cnt1, cnt1, 8);
1222 fmovd(tmp2, vtmp);
1223 }
1224 adds(cnt2, cnt2, isUL ? 4 : 8);
1225 br(GE, TAIL);
1226 eor(rscratch2, tmp1, tmp2);
1227 cbnz(rscratch2, DIFF);
1228 // main loop
1229 bind(NEXT_WORD);
1230 if (str1_isL == str2_isL) {
1231 ldr(tmp1, Address(str1, cnt2));
1232 ldr(tmp2, Address(str2, cnt2));
1233 adds(cnt2, cnt2, 8);
1234 } else if (isLU) {
1235 ldrs(vtmp, Address(str1, cnt1));
1236 ldr(tmp2, Address(str2, cnt2));
1237 add(cnt1, cnt1, 4);
1238 zip1(vtmp, T8B, vtmp, vtmpZ);
1239 fmovd(tmp1, vtmp);
1240 adds(cnt2, cnt2, 8);
1241 } else { // UL
1242 ldrs(vtmp, Address(str2, cnt2));
1243 ldr(tmp1, Address(str1, cnt1));
1244 zip1(vtmp, T8B, vtmp, vtmpZ);
1245 add(cnt1, cnt1, 8);
1246 fmovd(tmp2, vtmp);
1247 adds(cnt2, cnt2, 4);
1248 }
1249 br(GE, TAIL);
1250
1251 eor(rscratch2, tmp1, tmp2);
1252 cbz(rscratch2, NEXT_WORD);
1253 b(DIFF);
1254 bind(TAIL);
1255 eor(rscratch2, tmp1, tmp2);
1256 cbnz(rscratch2, DIFF);
1257 // Last longword. In the case where length == 4 we compare the
1258 // same longword twice, but that's still faster than another
1259 // conditional branch.
1260 if (str1_isL == str2_isL) {
1261 ldr(tmp1, Address(str1));
1262 ldr(tmp2, Address(str2));
1263 } else if (isLU) {
1264 ldrs(vtmp, Address(str1));
1265 ldr(tmp2, Address(str2));
1266 zip1(vtmp, T8B, vtmp, vtmpZ);
1267 fmovd(tmp1, vtmp);
1268 } else { // UL
1269 ldrs(vtmp, Address(str2));
1270 ldr(tmp1, Address(str1));
1271 zip1(vtmp, T8B, vtmp, vtmpZ);
1272 fmovd(tmp2, vtmp);
1273 }
1274 bind(TAIL_CHECK);
1275 eor(rscratch2, tmp1, tmp2);
1276 cbz(rscratch2, DONE);
1277
1278 // Find the first different characters in the longwords and
1279 // compute their difference.
1280 bind(DIFF);
1281 rev(rscratch2, rscratch2);
1282 clz(rscratch2, rscratch2);
1283 andr(rscratch2, rscratch2, isLL ? -8 : -16);
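    // rscratch2 is now the bit offset of the first (lowest-addressed) differing
    // character, rounded down to a character boundary; shifting both words right
    // by it brings that character to bit 0 for the subtraction below.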
1284 lsrv(tmp1, tmp1, rscratch2);
1285 (this->*ext_chr)(tmp1, tmp1);
1286 lsrv(tmp2, tmp2, rscratch2);
1287 (this->*ext_chr)(tmp2, tmp2);
1288 subw(result, tmp1, tmp2);
1289 b(DONE);
1290 }
1291
1292 bind(STUB);
1293 RuntimeAddress stub = nullptr;
1294 switch(ae) {
1295 case StrIntrinsicNode::LL:
1296 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1297 break;
1298 case StrIntrinsicNode::UU:
1299 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1300 break;
1301 case StrIntrinsicNode::LU:
1302 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1303 break;
1304 case StrIntrinsicNode::UL:
1305 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1306 break;
1307 default:
1308 ShouldNotReachHere();
1309 }
1310 assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1311 address call = trampoline_call(stub);
1312 if (call == nullptr) {
1313 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1314 ciEnv::current()->record_failure("CodeCache is full");
1315 return;
1316 }
1317 b(DONE);
1318
1319 bind(SHORT_STRING);
1320 // Is the minimum length zero?
1321 cbz(cnt2, DONE);
  // Arrange the code so that most branches are resolved while loading, and the next
  // characters are loaded while the previous ones are being compared.
1324 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1325 subs(cnt2, cnt2, 1);
1326 br(EQ, SHORT_LAST_INIT);
1327 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1328 b(SHORT_LOOP_START);
1329 bind(SHORT_LOOP);
1330 subs(cnt2, cnt2, 1);
1331 br(EQ, SHORT_LAST);
1332 bind(SHORT_LOOP_START);
1333 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1334 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1335 cmp(tmp1, cnt1);
1336 br(NE, SHORT_LOOP_TAIL);
1337 subs(cnt2, cnt2, 1);
1338 br(EQ, SHORT_LAST2);
1339 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1340 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1341 cmp(tmp2, rscratch1);
1342 br(EQ, SHORT_LOOP);
1343 sub(result, tmp2, rscratch1);
1344 b(DONE);
1345 bind(SHORT_LOOP_TAIL);
1346 sub(result, tmp1, cnt1);
1347 b(DONE);
1348 bind(SHORT_LAST2);
1349 cmp(tmp2, rscratch1);
1350 br(EQ, DONE);
1351 sub(result, tmp2, rscratch1);
1352
1353 b(DONE);
1354 bind(SHORT_LAST_INIT);
1355 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1356 bind(SHORT_LAST);
1357 cmp(tmp1, cnt1);
1358 br(EQ, DONE);
1359 sub(result, tmp1, cnt1);
1360
1361 bind(DONE);
1362
1363 BLOCK_COMMENT("} string_compare");
1364 }
1365
1366 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1367 FloatRegister src2, Condition cond, bool isQ) {
1368 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1369 FloatRegister zn = src1, zm = src2;
1370 bool needs_negation = false;
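  // ASIMD only provides GT/GE/HI/HS/EQ register-register compares, so LT/LE/LO/LS are
  // handled by swapping the operands, and NE by negating the EQ result.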
1371 switch (cond) {
1372 case LT: cond = GT; zn = src2; zm = src1; break;
1373 case LE: cond = GE; zn = src2; zm = src1; break;
1374 case LO: cond = HI; zn = src2; zm = src1; break;
1375 case LS: cond = HS; zn = src2; zm = src1; break;
1376 case NE: cond = EQ; needs_negation = true; break;
1377 default:
1378 break;
1379 }
1380
1381 if (is_floating_point_type(bt)) {
1382 fcm(cond, dst, size, zn, zm);
1383 } else {
1384 cm(cond, dst, size, zn, zm);
1385 }
1386
1387 if (needs_negation) {
1388 notr(dst, isQ ? T16B : T8B, dst);
1389 }
1390 }
1391
1392 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1393 Condition cond, bool isQ) {
1394 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1395 if (bt == T_FLOAT || bt == T_DOUBLE) {
1396 if (cond == Assembler::NE) {
1397 fcm(Assembler::EQ, dst, size, src);
1398 notr(dst, isQ ? T16B : T8B, dst);
1399 } else {
1400 fcm(cond, dst, size, src);
1401 }
1402 } else {
1403 if (cond == Assembler::NE) {
1404 cm(Assembler::EQ, dst, size, src);
1405 notr(dst, isQ ? T16B : T8B, dst);
1406 } else {
1407 cm(cond, dst, size, src);
1408 }
1409 }
1410 }
1411
1412 // Compress the least significant bit of each byte to the rightmost and clear
1413 // the higher garbage bits.
1414 void C2_MacroAssembler::bytemask_compress(Register dst) {
1415 // Example input, dst = 0x01 00 00 00 01 01 00 01
1416 // The "??" bytes are garbage.
1417 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1418 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1419 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1420 andr(dst, dst, 0xff); // dst = 0x8D
1421 }
1422
1423 // Pack the value of each mask element in "src" into a long value in "dst", at most
1424 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1425 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1426 // one bit in "dst".
1427 //
1428 // Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1429 // Expected: dst = 0x658D
1430 //
1431 // Clobbers: rscratch1
1432 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1433 FloatRegister vtmp, int lane_cnt) {
1434 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1435 assert_different_registers(dst, rscratch1);
1436 assert_different_registers(src, vtmp);
1437 assert(UseSVE > 0, "must be");
1438
1439 // Compress the lowest 8 bytes.
1440 fmovd(dst, src);
1441 bytemask_compress(dst);
1442 if (lane_cnt <= 8) return;
1443
1444 // Repeat on higher bytes and join the results.
1445 // Compress 8 bytes in each iteration.
1446 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1447 sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1448 bytemask_compress(rscratch1);
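    // Merge this compressed 8-lane group into its byte position (bits [idx*8, idx*8+7]) of dst.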
1449 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1450 }
1451 }
1452
// This function is the same as "sve_vmask_tolong" above, but it uses the SVE2 BEXT
// instruction, which requires the FEAT_BITPERM feature.
1455 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1456 FloatRegister vtmp1, FloatRegister vtmp2,
1457 int lane_cnt) {
1458 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1459 assert_different_registers(src, vtmp1, vtmp2);
1460 assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1461
1462 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1463 // is to compress each significant bit of the byte in a cross-lane way. Due
1464 // to the lack of a cross-lane bit-compress instruction, we use BEXT
1465 // (bit-compress in each lane) with the biggest lane size (T = D) then
1466 // concatenate the results.
1467
1468 // The second source input of BEXT, initialized with 0x01 in each byte.
1469 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1470 sve_dup(vtmp2, B, 1);
1471
1472 // BEXT vtmp1.D, src.D, vtmp2.D
1473 // src = 0x0001010000010001 | 0x0100000001010001
1474 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1475 // ---------------------------------------
1476 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1477 sve_bext(vtmp1, D, src, vtmp2);
1478
1479 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1480 // result to dst.
1481 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1482 // dst = 0x658D
1483 if (lane_cnt <= 8) {
1484 // No need to concatenate.
1485 umov(dst, vtmp1, B, 0);
1486 } else if (lane_cnt <= 16) {
1487 ins(vtmp1, B, vtmp1, 1, 8);
1488 umov(dst, vtmp1, H, 0);
1489 } else {
1490 // As the lane count is 64 at most, the final expected value must be in
1491 // the lowest 64 bits after narrowing vtmp1 from D to B.
1492 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1493 umov(dst, vtmp1, D, 0);
1494 }
1495 }
1496
1497 // Unpack the mask, a long value in "src", into a vector register of boolean
1498 // represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
1499 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
1500 // most 64 lanes.
1501 //
1502 // Below example gives the expected dst vector register, with a valid src(0x658D)
1503 // on a 128-bit vector size machine.
1504 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1505 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1506 FloatRegister vtmp, int lane_cnt) {
1507 assert_different_registers(dst, vtmp);
1508 assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1509 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1510
1511 // Example: src = 0x658D, lane_cnt = 16
1512 // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1513
1514 // Put long value from general purpose register into the first lane of vector.
1515 // vtmp = 0x0000000000000000 | 0x000000000000658D
1516 sve_dup(vtmp, B, 0);
1517 mov(vtmp, D, 0, src);
1518
1519 // Transform the value in the first lane which is mask in bit now to the mask in
1520 // byte, which can be done by SVE2's BDEP instruction.
1521
  // The first source input of the BDEP instruction. Place each mask byte into its own 8-byte group.
1523 // vtmp = 0x0000000000000065 | 0x000000000000008D
1524 if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1526 } else if (lane_cnt <= 16) {
1527 ins(vtmp, B, vtmp, 8, 1);
1528 } else {
1529 sve_vector_extend(vtmp, D, vtmp, B);
1530 }
1531
1532 // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1533 // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1534 sve_dup(dst, B, 1);
1535
1536 // BDEP dst.D, vtmp.D, dst.D
1537 // vtmp = 0x0000000000000065 | 0x000000000000008D
1538 // dst = 0x0101010101010101 | 0x0101010101010101
1539 // ---------------------------------------
1540 // dst = 0x0001010000010001 | 0x0100000001010001
1541 sve_bdep(dst, D, vtmp, dst);
1542 }
1543
1544 // Clobbers: rflags
1545 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1546 FloatRegister zn, FloatRegister zm, Condition cond) {
1547 assert(pg->is_governing(), "This register has to be a governing predicate register");
1548 FloatRegister z1 = zn, z2 = zm;
1549 switch (cond) {
1550 case LE: z1 = zm; z2 = zn; cond = GE; break;
1551 case LT: z1 = zm; z2 = zn; cond = GT; break;
1552 case LO: z1 = zm; z2 = zn; cond = HI; break;
1553 case LS: z1 = zm; z2 = zn; cond = HS; break;
1554 default:
1555 break;
1556 }
1557
1558 SIMD_RegVariant size = elemType_to_regVariant(bt);
1559 if (is_floating_point_type(bt)) {
1560 sve_fcm(cond, pd, size, pg, z1, z2);
1561 } else {
1562 assert(is_integral_type(bt), "unsupported element type");
1563 sve_cmp(cond, pd, size, pg, z1, z2);
1564 }
1565 }
1566
1567 // Get index of the last mask lane that is set
1568 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
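  // Reverse the mask so the last set lane becomes the first, break before that lane and
  // count the active lanes: the count equals (max_lanes - 1) - last_index, which the
  // subtraction below turns back into the index of the last set lane.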
1569 SIMD_RegVariant size = elemType_to_regVariant(bt);
1570 sve_rev(ptmp, size, src);
1571 sve_brkb(ptmp, ptrue, ptmp, false);
1572 sve_cntp(dst, size, ptrue, ptmp);
1573 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1574 subw(dst, rscratch1, dst);
1575 }
1576
1577 // Extend integer vector src to dst with the same lane count
1578 // but larger element size, e.g. 4B -> 4I
1579 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1580 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1581 if (src_bt == T_BYTE) {
1582 // 4B to 4S/4I, 8B to 8S
1583 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1584 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1585 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1586 if (dst_bt == T_INT) {
1587 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1588 }
1589 } else if (src_bt == T_SHORT) {
1590 // 2S to 2I/2L, 4S to 4I
1591 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1592 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1593 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1594 if (dst_bt == T_LONG) {
1595 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1596 }
1597 } else if (src_bt == T_INT) {
1598 // 2I to 2L
1599 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1600 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1601 } else {
1602 ShouldNotReachHere();
1603 }
1604 }
1605
1606 // Narrow integer vector src down to dst with the same lane count
1607 // but smaller element size, e.g. 4I -> 4B
1608 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1609 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1610 if (src_bt == T_SHORT) {
1611 // 4S/8S to 4B/8B
1612 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1613 assert(dst_bt == T_BYTE, "unsupported");
1614 xtn(dst, T8B, src, T8H);
1615 } else if (src_bt == T_INT) {
1616 // 2I to 2S, 4I to 4B/4S
1617 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1618 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1619 xtn(dst, T4H, src, T4S);
1620 if (dst_bt == T_BYTE) {
1621 xtn(dst, T8B, dst, T8H);
1622 }
1623 } else if (src_bt == T_LONG) {
1624 // 2L to 2S/2I
1625 assert(src_vlen_in_bytes == 16, "unsupported");
1626 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1627 xtn(dst, T2S, src, T2D);
1628 if (dst_bt == T_SHORT) {
1629 xtn(dst, T4H, dst, T4S);
1630 }
1631 } else {
1632 ShouldNotReachHere();
1633 }
1634 }
1635
1636 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1637 FloatRegister src, SIMD_RegVariant src_size,
1638 bool is_unsigned) {
1639 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1640
1641 if (src_size == B) {
1642 switch (dst_size) {
1643 case H:
1644 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1645 break;
1646 case S:
1647 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1648 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1649 break;
1650 case D:
1651 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1652 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1653 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1654 break;
1655 default:
1656 ShouldNotReachHere();
1657 }
1658 } else if (src_size == H) {
1659 if (dst_size == S) {
1660 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1661 } else { // D
1662 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1663 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1664 }
1665 } else if (src_size == S) {
1666 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1667 }
1668 }
1669
// Vector narrow from src to dst with the specified element sizes.
// The high part of the dst vector will be filled with zeros.
1672 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1673 FloatRegister src, SIMD_RegVariant src_size,
1674 FloatRegister tmp) {
1675 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1676 assert_different_registers(src, tmp);
1677 sve_dup(tmp, src_size, 0);
1678 if (src_size == D) {
1679 switch (dst_size) {
1680 case S:
1681 sve_uzp1(dst, S, src, tmp);
1682 break;
1683 case H:
1684 assert_different_registers(dst, tmp);
1685 sve_uzp1(dst, S, src, tmp);
1686 sve_uzp1(dst, H, dst, tmp);
1687 break;
1688 case B:
1689 assert_different_registers(dst, tmp);
1690 sve_uzp1(dst, S, src, tmp);
1691 sve_uzp1(dst, H, dst, tmp);
1692 sve_uzp1(dst, B, dst, tmp);
1693 break;
1694 default:
1695 ShouldNotReachHere();
1696 }
1697 } else if (src_size == S) {
1698 if (dst_size == H) {
1699 sve_uzp1(dst, H, src, tmp);
1700 } else { // B
1701 assert_different_registers(dst, tmp);
1702 sve_uzp1(dst, H, src, tmp);
1703 sve_uzp1(dst, B, dst, tmp);
1704 }
1705 } else if (src_size == H) {
1706 sve_uzp1(dst, B, src, tmp);
1707 }
1708 }
1709
1710 // Extend src predicate to dst predicate with the same lane count but larger
1711 // element size, e.g. 64Byte -> 512Long
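// An illustrative example, mirroring the narrowing example below:
// 64Int -> 128Long on a 128-bit machine, i.e. 2I -> 2L
//   Mask (for 2 ints)                                : TF
//   Predicate register for the above mask (16 bits) : 00000000 00010000
//   After punpklo (dst)                              : 00000001 00000000
//   Which is the predicate encoding of the mask TF for 2 longs.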
1712 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1713 uint dst_element_length_in_bytes,
1714 uint src_element_length_in_bytes) {
1715 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1716 sve_punpklo(dst, src);
1717 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1718 sve_punpklo(dst, src);
1719 sve_punpklo(dst, dst);
1720 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1721 sve_punpklo(dst, src);
1722 sve_punpklo(dst, dst);
1723 sve_punpklo(dst, dst);
1724 } else {
1725 assert(false, "unsupported");
1726 ShouldNotReachHere();
1727 }
1728 }
1729
1730 // Narrow src predicate to dst predicate with the same lane count but
1731 // smaller element size, e.g. 512Long -> 64Byte
1732 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1733 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1734 // The insignificant bits in src predicate are expected to be zero.
1735 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
// passed as the second argument. An example narrowing operation with a given mask would be:
// 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1738 // Mask (for 2 Longs) : TF
1739 // Predicate register for the above mask (16 bits) : 00000001 00000000
1740 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1741 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1742 assert_different_registers(src, ptmp);
1743 assert_different_registers(dst, ptmp);
1744 sve_pfalse(ptmp);
1745 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1746 sve_uzp1(dst, B, src, ptmp);
1747 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1748 sve_uzp1(dst, H, src, ptmp);
1749 sve_uzp1(dst, B, dst, ptmp);
1750 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1751 sve_uzp1(dst, S, src, ptmp);
1752 sve_uzp1(dst, H, dst, ptmp);
1753 sve_uzp1(dst, B, dst, ptmp);
1754 } else {
1755 assert(false, "unsupported");
1756 ShouldNotReachHere();
1757 }
1758 }
1759
1760 // Vector reduction add for integral type with ASIMD instructions.
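// For example (illustrative), reducing a 128-bit vector of 4 ints: addv sums the four
// lanes into lane 0 of vtmp, umov moves that lane into dst, and addw folds in the
// scalar input isrc.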
1761 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1762 Register isrc, FloatRegister vsrc,
1763 unsigned vector_length_in_bytes,
1764 FloatRegister vtmp) {
1765 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1766 assert_different_registers(dst, isrc);
1767 bool isQ = vector_length_in_bytes == 16;
1768
1769 BLOCK_COMMENT("neon_reduce_add_integral {");
1770 switch(bt) {
1771 case T_BYTE:
1772 addv(vtmp, isQ ? T16B : T8B, vsrc);
1773 smov(dst, vtmp, B, 0);
1774 addw(dst, dst, isrc, ext::sxtb);
1775 break;
1776 case T_SHORT:
1777 addv(vtmp, isQ ? T8H : T4H, vsrc);
1778 smov(dst, vtmp, H, 0);
1779 addw(dst, dst, isrc, ext::sxth);
1780 break;
1781 case T_INT:
1782 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1783 umov(dst, vtmp, S, 0);
1784 addw(dst, dst, isrc);
1785 break;
1786 case T_LONG:
1787 assert(isQ, "unsupported");
1788 addpd(vtmp, vsrc);
1789 umov(dst, vtmp, D, 0);
1790 add(dst, dst, isrc);
1791 break;
1792 default:
1793 assert(false, "unsupported");
1794 ShouldNotReachHere();
1795 }
1796 BLOCK_COMMENT("} neon_reduce_add_integral");
1797 }
1798
1799 // Vector reduction multiply for integral type with ASIMD instructions.
1800 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1801 // Clobbers: rscratch1
1802 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1803 Register isrc, FloatRegister vsrc,
1804 unsigned vector_length_in_bytes,
1805 FloatRegister vtmp1, FloatRegister vtmp2) {
1806 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1807 bool isQ = vector_length_in_bytes == 16;
1808
1809 BLOCK_COMMENT("neon_reduce_mul_integral {");
1810 switch(bt) {
1811 case T_BYTE:
1812 if (isQ) {
1813 // Multiply the lower half and higher half of vector iteratively.
1814 // vtmp1 = vsrc[8:15]
1815 ins(vtmp1, D, vsrc, 0, 1);
1816 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1817 mulv(vtmp1, T8B, vtmp1, vsrc);
1818 // vtmp2 = vtmp1[4:7]
1819 ins(vtmp2, S, vtmp1, 0, 1);
1820 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1821 mulv(vtmp1, T8B, vtmp2, vtmp1);
1822 } else {
1823 ins(vtmp1, S, vsrc, 0, 1);
1824 mulv(vtmp1, T8B, vtmp1, vsrc);
1825 }
1826 // vtmp2 = vtmp1[2:3]
1827 ins(vtmp2, H, vtmp1, 0, 1);
1828 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1829 mulv(vtmp2, T8B, vtmp2, vtmp1);
1830 // dst = vtmp2[0] * isrc * vtmp2[1]
1831 umov(rscratch1, vtmp2, B, 0);
1832 mulw(dst, rscratch1, isrc);
1833 sxtb(dst, dst);
1834 umov(rscratch1, vtmp2, B, 1);
1835 mulw(dst, rscratch1, dst);
1836 sxtb(dst, dst);
1837 break;
1838 case T_SHORT:
1839 if (isQ) {
1840 ins(vtmp2, D, vsrc, 0, 1);
1841 mulv(vtmp2, T4H, vtmp2, vsrc);
1842 ins(vtmp1, S, vtmp2, 0, 1);
1843 mulv(vtmp1, T4H, vtmp1, vtmp2);
1844 } else {
1845 ins(vtmp1, S, vsrc, 0, 1);
1846 mulv(vtmp1, T4H, vtmp1, vsrc);
1847 }
1848 umov(rscratch1, vtmp1, H, 0);
1849 mulw(dst, rscratch1, isrc);
1850 sxth(dst, dst);
1851 umov(rscratch1, vtmp1, H, 1);
1852 mulw(dst, rscratch1, dst);
1853 sxth(dst, dst);
1854 break;
1855 case T_INT:
1856 if (isQ) {
1857 ins(vtmp1, D, vsrc, 0, 1);
1858 mulv(vtmp1, T2S, vtmp1, vsrc);
1859 } else {
1860 vtmp1 = vsrc;
1861 }
1862 umov(rscratch1, vtmp1, S, 0);
1863 mul(dst, rscratch1, isrc);
1864 umov(rscratch1, vtmp1, S, 1);
1865 mul(dst, rscratch1, dst);
1866 break;
1867 case T_LONG:
1868 umov(rscratch1, vsrc, D, 0);
1869 mul(dst, isrc, rscratch1);
1870 umov(rscratch1, vsrc, D, 1);
1871 mul(dst, dst, rscratch1);
1872 break;
1873 default:
1874 assert(false, "unsupported");
1875 ShouldNotReachHere();
1876 }
1877 BLOCK_COMMENT("} neon_reduce_mul_integral");
1878 }
1879
1880 // Vector reduction multiply for floating-point type with ASIMD instructions.
1881 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1882 FloatRegister fsrc, FloatRegister vsrc,
1883 unsigned vector_length_in_bytes,
1884 FloatRegister vtmp) {
1885 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1886 bool isQ = vector_length_in_bytes == 16;
1887
1888 BLOCK_COMMENT("neon_reduce_mul_fp {");
1889 switch(bt) {
1890 case T_FLOAT:
1891 fmuls(dst, fsrc, vsrc);
1892 ins(vtmp, S, vsrc, 0, 1);
1893 fmuls(dst, dst, vtmp);
1894 if (isQ) {
1895 ins(vtmp, S, vsrc, 0, 2);
1896 fmuls(dst, dst, vtmp);
1897 ins(vtmp, S, vsrc, 0, 3);
1898 fmuls(dst, dst, vtmp);
1899 }
1900 break;
1901 case T_DOUBLE:
1902 assert(isQ, "unsupported");
1903 fmuld(dst, fsrc, vsrc);
1904 ins(vtmp, D, vsrc, 0, 1);
1905 fmuld(dst, dst, vtmp);
1906 break;
1907 default:
1908 assert(false, "unsupported");
1909 ShouldNotReachHere();
1910 }
1911 BLOCK_COMMENT("} neon_reduce_mul_fp");
1912 }
1913
1914 // Helper to select logical instruction
1915 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1916 Register Rn, Register Rm,
1917 enum shift_kind kind, unsigned shift) {
1918 switch(opc) {
1919 case Op_AndReductionV:
1920 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1921 break;
1922 case Op_OrReductionV:
1923 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1924 break;
1925 case Op_XorReductionV:
1926 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1927 break;
1928 default:
1929 assert(false, "unsupported");
1930 ShouldNotReachHere();
1931 }
1932 }
1933
1934 // Vector reduction logical operations And, Or, Xor
1935 // Clobbers: rscratch1
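// The vector is first folded into a 64-bit scalar by combining its two halves with the
// logical op, and the scalar is then folded with halving shifts. For example
// (illustrative), an Or reduction of 16 bytes ors the two 64-bit halves, then ors dst
// with itself shifted right by 32, 16 and 8 bits, and finally ors in the scalar isrc.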
1936 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1937 Register isrc, FloatRegister vsrc,
1938 unsigned vector_length_in_bytes) {
1939 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1940 "unsupported");
1941 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1942 assert_different_registers(dst, isrc);
1943 bool isQ = vector_length_in_bytes == 16;
1944
1945 BLOCK_COMMENT("neon_reduce_logical {");
1946 umov(rscratch1, vsrc, isQ ? D : S, 0);
1947 umov(dst, vsrc, isQ ? D : S, 1);
1948 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1949 switch(bt) {
1950 case T_BYTE:
1951 if (isQ) {
1952 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1953 }
1954 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1955 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1956 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1957 sxtb(dst, dst);
1958 break;
1959 case T_SHORT:
1960 if (isQ) {
1961 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1962 }
1963 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1964 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1965 sxth(dst, dst);
1966 break;
1967 case T_INT:
1968 if (isQ) {
1969 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1970 }
1971 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1972 break;
1973 case T_LONG:
1974 assert(isQ, "unsupported");
1975 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1976 break;
1977 default:
1978 assert(false, "unsupported");
1979 ShouldNotReachHere();
1980 }
1981 BLOCK_COMMENT("} neon_reduce_logical");
1982 }
1983
1984 // Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1986 // Clobbers: rscratch1, rflags
1987 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1988 Register isrc, FloatRegister vsrc,
1989 unsigned vector_length_in_bytes,
1990 FloatRegister vtmp) {
1991 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1992 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1993 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1994 assert_different_registers(dst, isrc);
1995 bool isQ = vector_length_in_bytes == 16;
1996 bool is_min = opc == Op_MinReductionV;
1997
1998 BLOCK_COMMENT("neon_reduce_minmax_integral {");
1999 if (bt == T_LONG) {
2000 assert(vtmp == fnoreg, "should be");
2001 assert(isQ, "should be");
2002 umov(rscratch1, vsrc, D, 0);
2003 cmp(isrc, rscratch1);
2004 csel(dst, isrc, rscratch1, is_min ? LT : GT);
2005 umov(rscratch1, vsrc, D, 1);
2006 cmp(dst, rscratch1);
2007 csel(dst, dst, rscratch1, is_min ? LT : GT);
2008 } else {
2009 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2010 if (size == T2S) {
2011 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2012 } else {
2013 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2014 }
2015 if (bt == T_INT) {
2016 umov(dst, vtmp, S, 0);
2017 } else {
2018 smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2019 }
2020 cmpw(dst, isrc);
2021 cselw(dst, dst, isrc, is_min ? LT : GT);
2022 }
2023 BLOCK_COMMENT("} neon_reduce_minmax_integral");
2024 }
2025
// Vector reduction for integral type with SVE instructions.
2027 // Supported operations are Add, And, Or, Xor, Max, Min.
2028 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
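// For example (illustrative), Op_AddReductionVI on T_SHORT: sve_uaddv adds all active
// lanes of src2 into tmp, smov sign-extends lane 0 of tmp into dst, and addw folds in
// the scalar input src1.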
2029 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2030 FloatRegister src2, PRegister pg, FloatRegister tmp) {
2031 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2032 assert(pg->is_governing(), "This register has to be a governing predicate register");
2033 assert_different_registers(src1, dst);
2034 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2035 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2036 switch (opc) {
2037 case Op_AddReductionVI: {
2038 sve_uaddv(tmp, size, pg, src2);
2039 if (bt == T_BYTE) {
2040 smov(dst, tmp, size, 0);
2041 addw(dst, src1, dst, ext::sxtb);
2042 } else if (bt == T_SHORT) {
2043 smov(dst, tmp, size, 0);
2044 addw(dst, src1, dst, ext::sxth);
2045 } else {
2046 umov(dst, tmp, size, 0);
2047 addw(dst, dst, src1);
2048 }
2049 break;
2050 }
2051 case Op_AddReductionVL: {
2052 sve_uaddv(tmp, size, pg, src2);
2053 umov(dst, tmp, size, 0);
2054 add(dst, dst, src1);
2055 break;
2056 }
2057 case Op_AndReductionV: {
2058 sve_andv(tmp, size, pg, src2);
2059 if (bt == T_INT || bt == T_LONG) {
2060 umov(dst, tmp, size, 0);
2061 } else {
2062 smov(dst, tmp, size, 0);
2063 }
2064 if (bt == T_LONG) {
2065 andr(dst, dst, src1);
2066 } else {
2067 andw(dst, dst, src1);
2068 }
2069 break;
2070 }
2071 case Op_OrReductionV: {
2072 sve_orv(tmp, size, pg, src2);
2073 if (bt == T_INT || bt == T_LONG) {
2074 umov(dst, tmp, size, 0);
2075 } else {
2076 smov(dst, tmp, size, 0);
2077 }
2078 if (bt == T_LONG) {
2079 orr(dst, dst, src1);
2080 } else {
2081 orrw(dst, dst, src1);
2082 }
2083 break;
2084 }
2085 case Op_XorReductionV: {
2086 sve_eorv(tmp, size, pg, src2);
2087 if (bt == T_INT || bt == T_LONG) {
2088 umov(dst, tmp, size, 0);
2089 } else {
2090 smov(dst, tmp, size, 0);
2091 }
2092 if (bt == T_LONG) {
2093 eor(dst, dst, src1);
2094 } else {
2095 eorw(dst, dst, src1);
2096 }
2097 break;
2098 }
2099 case Op_MaxReductionV: {
2100 sve_smaxv(tmp, size, pg, src2);
2101 if (bt == T_INT || bt == T_LONG) {
2102 umov(dst, tmp, size, 0);
2103 } else {
2104 smov(dst, tmp, size, 0);
2105 }
2106 if (bt == T_LONG) {
2107 cmp(dst, src1);
2108 csel(dst, dst, src1, Assembler::GT);
2109 } else {
2110 cmpw(dst, src1);
2111 cselw(dst, dst, src1, Assembler::GT);
2112 }
2113 break;
2114 }
2115 case Op_MinReductionV: {
2116 sve_sminv(tmp, size, pg, src2);
2117 if (bt == T_INT || bt == T_LONG) {
2118 umov(dst, tmp, size, 0);
2119 } else {
2120 smov(dst, tmp, size, 0);
2121 }
2122 if (bt == T_LONG) {
2123 cmp(dst, src1);
2124 csel(dst, dst, src1, Assembler::LT);
2125 } else {
2126 cmpw(dst, src1);
2127 cselw(dst, dst, src1, Assembler::LT);
2128 }
2129 break;
2130 }
2131 default:
2132 assert(false, "unsupported");
2133 ShouldNotReachHere();
2134 }
2135
2136 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2137 if (bt == T_BYTE) {
2138 sxtb(dst, dst);
2139 } else if (bt == T_SHORT) {
2140 sxth(dst, dst);
2141 }
2142 }
2143 }
2144
// Set elements of the dst predicate to true for lanes in the range [0, lane_cnt), and
// to false otherwise. The input "lane_cnt" must be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and rFlagsReg.
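// For example (illustrative, assuming 32 int lanes): lane_cnt == 5 maps directly to the
// "ptrue" VL5 pattern, lane_cnt == 32 uses the ALL pattern, and a count such as 23 that
// matches no fixed or special pattern falls back to "whileltw".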
2148 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2149 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2150 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2151
2152 // Set all elements to false if the input "lane_cnt" is zero.
2153 if (lane_cnt == 0) {
2154 sve_pfalse(dst);
2155 return;
2156 }
2157
2158 SIMD_RegVariant size = elemType_to_regVariant(bt);
2159 assert(size != Q, "invalid size");
2160
2161 // Set all true if "lane_cnt" equals to the max lane count.
2162 if (lane_cnt == max_vector_length) {
2163 sve_ptrue(dst, size, /* ALL */ 0b11111);
2164 return;
2165 }
2166
2167 // Fixed numbers for "ptrue".
2168 switch(lane_cnt) {
2169 case 1: /* VL1 */
2170 case 2: /* VL2 */
2171 case 3: /* VL3 */
2172 case 4: /* VL4 */
2173 case 5: /* VL5 */
2174 case 6: /* VL6 */
2175 case 7: /* VL7 */
2176 case 8: /* VL8 */
2177 sve_ptrue(dst, size, lane_cnt);
2178 return;
2179 case 16:
2180 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2181 return;
2182 case 32:
2183 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2184 return;
2185 case 64:
2186 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2187 return;
2188 case 128:
2189 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2190 return;
2191 case 256:
2192 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2193 return;
2194 default:
2195 break;
2196 }
2197
2198 // Special patterns for "ptrue".
2199 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2200 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2201 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2202 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2203 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2204 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2205 } else {
2206 // Encode to "whileltw" for the remaining cases.
2207 mov(rscratch1, lane_cnt);
2208 sve_whileltw(dst, size, zr, rscratch1);
2209 }
2210 }
2211
2212 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2213 // Any remaining elements of dst will be filled with zero.
2214 // Clobbers: rscratch1
2215 // Preserves: mask, vzr
2216 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2217 FloatRegister vzr, FloatRegister vtmp,
2218 PRegister pgtmp, unsigned vector_length_in_bytes) {
2219 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2220 // When called by sve_compress_byte, src and vtmp may be the same register.
2221 assert_different_registers(dst, src, vzr);
2222 assert_different_registers(dst, vtmp, vzr);
2223 assert_different_registers(mask, pgtmp);
2224 // high <-- low
2225 // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
2226 // mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
2227 // Expected result: dst = 00 00 00 hh ee dd bb aa
2228
2229 // Extend lowest half to type INT.
2230 // dst = 00dd 00cc 00bb 00aa
2231 sve_uunpklo(dst, S, src);
2232 // pgtmp = 0001 0000 0001 0001
2233 sve_punpklo(pgtmp, mask);
2234 // Pack the active elements in size of type INT to the right,
// and fill the remaining elements with zero.
2236 // dst = 0000 00dd 00bb 00aa
2237 sve_compact(dst, S, dst, pgtmp);
2238 // Narrow the result back to type SHORT.
2239 // dst = 00 00 00 00 00 dd bb aa
2240 sve_uzp1(dst, H, dst, vzr);
2241
2242 // Return if the vector length is no more than MaxVectorSize/2, since the
2243 // highest half is invalid.
2244 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2245 return;
2246 }
2247
2248 // Count the active elements of lowest half.
2249 // rscratch1 = 3
2250 sve_cntp(rscratch1, S, ptrue, pgtmp);
2251
2252 // Repeat to the highest half.
2253 // pgtmp = 0001 0000 0000 0001
2254 sve_punpkhi(pgtmp, mask);
2255 // vtmp = 00hh 00gg 00ff 00ee
2256 sve_uunpkhi(vtmp, S, src);
2257 // vtmp = 0000 0000 00hh 00ee
2258 sve_compact(vtmp, S, vtmp, pgtmp);
2259 // vtmp = 00 00 00 00 00 00 hh ee
2260 sve_uzp1(vtmp, H, vtmp, vzr);
2261
2262 // pgtmp = 00 00 00 00 00 01 01 01
2263 sve_whilelt(pgtmp, H, zr, rscratch1);
2264 // Compressed low: dst = 00 00 00 00 00 dd bb aa
2265 // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2266 // Combine the compressed low with the compressed high:
2267 // dst = 00 00 00 hh ee dd bb aa
2268 sve_splice(dst, H, pgtmp, vtmp);
2269 }
2270
2271 // Clobbers: rscratch1, rscratch2
2272 // Preserves: src, mask
2273 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2274 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2275 PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2276 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2277 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2278 assert_different_registers(mask, ptmp, pgtmp);
2279 // high <-- low
2280 // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
2281 // mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2282 // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2283 FloatRegister vzr = vtmp3;
2284 sve_dup(vzr, B, 0);
2285
2286 // Extend lowest half to type SHORT.
2287 // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
2288 sve_uunpklo(vtmp1, H, src);
2289 // ptmp = 00 01 00 00 00 01 00 01
2290 sve_punpklo(ptmp, mask);
2291 // Pack the active elements in size of type SHORT to the right,
// and fill the remaining elements with zero.
2293 // dst = 00 00 00 00 00 0g 0c 0a
2294 unsigned extended_size = vector_length_in_bytes << 1;
2295 sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2296 // Narrow the result back to type BYTE.
2297 // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2298 sve_uzp1(dst, B, dst, vzr);
2299
2300 // Return if the vector length is no more than MaxVectorSize/2, since the
2301 // highest half is invalid.
2302 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2303 return;
2304 }
2305 // Count the active elements of lowest half.
2306 // rscratch2 = 3
2307 sve_cntp(rscratch2, H, ptrue, ptmp);
2308
2309 // Repeat to the highest half.
2310 // ptmp = 00 01 00 00 00 00 00 01
2311 sve_punpkhi(ptmp, mask);
2312 // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
2313 sve_uunpkhi(vtmp2, H, src);
2314 // vtmp1 = 00 00 00 00 00 00 0p 0i
2315 sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2316 // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2317 sve_uzp1(vtmp1, B, vtmp1, vzr);
2318
2319 // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2320 sve_whilelt(ptmp, B, zr, rscratch2);
2321 // Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2322 // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2323 // Combine the compressed low with the compressed high:
2324 // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2325 sve_splice(dst, B, ptmp, vtmp1);
2326 }
2327
2328 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2329 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2330 SIMD_Arrangement size = isQ ? T16B : T8B;
2331 if (bt == T_BYTE) {
2332 rbit(dst, size, src);
2333 } else {
2334 neon_reverse_bytes(dst, src, bt, isQ);
2335 rbit(dst, size, dst);
2336 }
2337 }
2338
2339 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2340 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2341 SIMD_Arrangement size = isQ ? T16B : T8B;
2342 switch (bt) {
2343 case T_BYTE:
2344 if (dst != src) {
2345 orr(dst, size, src, src);
2346 }
2347 break;
2348 case T_SHORT:
2349 rev16(dst, size, src);
2350 break;
2351 case T_INT:
2352 rev32(dst, size, src);
2353 break;
2354 case T_LONG:
2355 rev64(dst, size, src);
2356 break;
2357 default:
2358 assert(false, "unsupported");
2359 ShouldNotReachHere();
2360 }
2361 }
2362
2363 // VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
// But since it supports byte tables only, we need to look up 2/4 bytes as a group.
2366 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2367 // and use bsl to implement the operation.
2368 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2369 FloatRegister shuffle, FloatRegister tmp,
2370 BasicType bt, bool isQ) {
2371 assert_different_registers(dst, src, shuffle, tmp);
2372 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2373 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2374
2375 // Here is an example that rearranges a NEON vector with 4 ints:
2376 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2377 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2378 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2379 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2380 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2381 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2382 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2383 // 4. Use Vm as index register, and use V1 as table register.
2384 // Then get V2 as the result by tbl NEON instructions.
2385 switch (bt) {
2386 case T_SHORT:
2387 mov(tmp, size1, 0x02);
2388 mulv(dst, size2, shuffle, tmp);
2389 mov(tmp, size2, 0x0100);
2390 addv(dst, size1, dst, tmp);
2391 tbl(dst, size1, src, 1, dst);
2392 break;
2393 case T_INT:
2394 case T_FLOAT:
2395 mov(tmp, size1, 0x04);
2396 mulv(dst, size2, shuffle, tmp);
2397 mov(tmp, size2, 0x03020100);
2398 addv(dst, size1, dst, tmp);
2399 tbl(dst, size1, src, 1, dst);
2400 break;
2401 case T_LONG:
2402 case T_DOUBLE:
// Load the iota indices for the Long type. The indices are ordered by
// type B/S/I/L/F/D, and the offset between two adjacent types is 16; hence
// the offset for L is 48.
2406 lea(rscratch1,
2407 ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2408 ldrq(tmp, rscratch1);
2409 // Check whether the input "shuffle" is the same with iota indices.
2410 // Return "src" if true, otherwise swap the two elements of "src".
2411 cm(EQ, dst, size2, shuffle, tmp);
2412 ext(tmp, size1, src, src, 8);
2413 bsl(dst, size1, src, tmp);
2414 break;
2415 default:
2416 assert(false, "unsupported element type");
2417 ShouldNotReachHere();
2418 }
2419 }
2420
// Extract a scalar element from an SVE vector at position 'idx'.
2422 // The input elements in src are expected to be of integral type.
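// For example (illustrative), extracting lane 2 of an int vector stays within the low
// 128 bits, so a single umov/smov suffices; extracting lane 5 first shifts the vector
// down by 5 * 4 bytes with sve_ext and then reads lane 0.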
2423 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2424 int idx, FloatRegister vtmp) {
2425 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2426 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2427 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2428 if (bt == T_INT || bt == T_LONG) {
2429 umov(dst, src, size, idx);
2430 } else {
2431 smov(dst, src, size, idx);
2432 }
2433 } else {
2434 sve_orr(vtmp, src, src);
2435 sve_ext(vtmp, vtmp, idx << size);
2436 if (bt == T_INT || bt == T_LONG) {
2437 umov(dst, vtmp, size, 0);
2438 } else {
2439 smov(dst, vtmp, size, 0);
2440 }
2441 }
2442 }
2443
2444 // java.lang.Math::round intrinsics
2445
2446 // Clobbers: rscratch1, rflags
2447 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2448 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2449 assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2450 switch (T) {
2451 case T2S:
2452 case T4S:
2453 fmovs(tmp1, T, 0.5f);
2454 mov(rscratch1, jint_cast(0x1.0p23f));
2455 break;
2456 case T2D:
2457 fmovd(tmp1, T, 0.5);
2458 mov(rscratch1, julong_cast(0x1.0p52));
2459 break;
2460 default:
2461 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2462 }
2463 fadd(tmp1, T, tmp1, src);
2464 fcvtms(tmp1, T, tmp1);
2465 // tmp1 = floor(src + 0.5, ties to even)
2466
2467 fcvtas(dst, T, src);
2468 // dst = round(src), ties to away
2469
2470 fneg(tmp3, T, src);
2471 dup(tmp2, T, rscratch1);
2472 cm(HS, tmp3, T, tmp3, tmp2);
2473 // tmp3 is now a set of flags
2474
2475 bif(dst, T16B, tmp1, tmp3);
2476 // result in dst
2477 }
2478
2479 // Clobbers: rscratch1, rflags
2480 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2481 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2482 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2483 assert_different_registers(tmp1, tmp2, src, dst);
2484
2485 switch (T) {
2486 case S:
2487 mov(rscratch1, jint_cast(0x1.0p23f));
2488 break;
2489 case D:
2490 mov(rscratch1, julong_cast(0x1.0p52));
2491 break;
2492 default:
2493 assert(T == S || T == D, "invalid register variant");
2494 }
2495
2496 sve_frinta(dst, T, ptrue, src);
2497 // dst = round(src), ties to away
2498
2499 Label none;
2500
2501 sve_fneg(tmp1, T, ptrue, src);
2502 sve_dup(tmp2, T, rscratch1);
2503 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2504 br(EQ, none);
2505 {
2506 sve_cpy(tmp1, T, pgtmp, 0.5);
2507 sve_fadd(tmp1, T, pgtmp, src);
2508 sve_frintm(dst, T, pgtmp, tmp1);
2509 // dst = floor(src + 0.5, ties to even)
2510 }
2511 bind(none);
2512
2513 sve_fcvtzs(dst, T, ptrue, dst, T);
2514 // result in dst
2515 }
2516
2517 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2518 FloatRegister one, SIMD_Arrangement T) {
2519 assert_different_registers(dst, src, zero, one);
2520 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2521
2522 facgt(dst, T, src, zero);
2523 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2524 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2525 }
2526
2527 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2528 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2529 assert_different_registers(dst, src, zero, one, vtmp);
2530 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2531
2532 sve_orr(vtmp, src, src);
2533 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2534 switch (T) {
2535 case S:
2536 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2537 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2538 // on the sign of the float value
2539 break;
2540 case D:
2541 sve_and(vtmp, T, min_jlong);
2542 sve_orr(vtmp, T, jlong_cast(1.0));
2543 break;
2544 default:
2545 assert(false, "unsupported");
2546 ShouldNotReachHere();
2547 }
2548 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2549 // Result in dst
2550 }
2551
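// Check whether C2 is currently emitting code into a scratch buffer merely to measure
// its size, rather than emitting the final code.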
2552 bool C2_MacroAssembler::in_scratch_emit_size() {
2553 if (ciEnv::current()->task() != nullptr) {
2554 PhaseOutput* phase_output = Compile::current()->output();
2555 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2556 return true;
2557 }
2558 }
2559 return MacroAssembler::in_scratch_emit_size();
2560 }
2561
2562 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2563 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2564 }
2565
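// Emit a runtime check that rval lies within the CastII type's [lo, hi] range.
// On failure, the arguments are handed to abort_verify_int_in_range, which calls fatal().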
2566 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2567 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2568 if (t == TypeInt::INT) {
2569 return;
2570 }
2571 BLOCK_COMMENT("verify_int_in_range {");
2572 Label L_success, L_failure;
2573
2574 jint lo = t->_lo;
2575 jint hi = t->_hi;
2576
2577 if (lo != min_jint && hi != max_jint) {
2578 subsw(rtmp, rval, lo);
2579 br(Assembler::LT, L_failure);
2580 subsw(rtmp, rval, hi);
2581 br(Assembler::LE, L_success);
2582 } else if (lo != min_jint) {
2583 subsw(rtmp, rval, lo);
2584 br(Assembler::GE, L_success);
2585 } else if (hi != max_jint) {
2586 subsw(rtmp, rval, hi);
2587 br(Assembler::LE, L_success);
2588 } else {
2589 ShouldNotReachHere();
2590 }
2591
2592 bind(L_failure);
2593 movw(c_rarg0, idx);
2594 mov(c_rarg1, rval);
2595 movw(c_rarg2, lo);
2596 movw(c_rarg3, hi);
2597 reconstruct_frame_pointer(rtmp);
2598 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2599 hlt(0);
2600
2601 bind(L_success);
2602 BLOCK_COMMENT("} verify_int_in_range");
2603 }
2604
2605 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2606 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2607 }
2608
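// Emit a runtime check that rval lies within the CastLL type's [lo, hi] range.
// On failure, the arguments are handed to abort_verify_long_in_range, which calls fatal().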
2609 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2610 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2611 if (t == TypeLong::LONG) {
2612 return;
2613 }
2614 BLOCK_COMMENT("verify_long_in_range {");
2615 Label L_success, L_failure;
2616
2617 jlong lo = t->_lo;
2618 jlong hi = t->_hi;
2619
2620 if (lo != min_jlong && hi != max_jlong) {
2621 subs(rtmp, rval, lo);
2622 br(Assembler::LT, L_failure);
2623 subs(rtmp, rval, hi);
2624 br(Assembler::LE, L_success);
2625 } else if (lo != min_jlong) {
2626 subs(rtmp, rval, lo);
2627 br(Assembler::GE, L_success);
2628 } else if (hi != max_jlong) {
2629 subs(rtmp, rval, hi);
2630 br(Assembler::LE, L_success);
2631 } else {
2632 ShouldNotReachHere();
2633 }
2634
2635 bind(L_failure);
2636 movw(c_rarg0, idx);
2637 mov(c_rarg1, rval);
2638 mov(c_rarg2, lo);
2639 mov(c_rarg3, hi);
2640 reconstruct_frame_pointer(rtmp);
2641 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2642 hlt(0);
2643
2644 bind(L_success);
2645 BLOCK_COMMENT("} verify_long_in_range");
2646 }
2647
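// Reconstruct rfp from sp and the compiled frame size when PreserveFramePointer is off;
// when it is on, rfp is already valid and is only verified in debug builds.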
2648 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2649 const int framesize = Compile::current()->output()->frame_size_in_bytes();
2650 if (PreserveFramePointer) {
2651 // frame pointer is valid
2652 #ifdef ASSERT
2653 // Verify frame pointer value in rfp.
2654 add(rtmp, sp, framesize - 2 * wordSize);
2655 Label L_success;
2656 cmp(rfp, rtmp);
2657 br(Assembler::EQ, L_success);
2658 stop("frame pointer mismatch");
2659 bind(L_success);
2660 #endif // ASSERT
2661 } else {
2662 add(rfp, sp, framesize - 2 * wordSize);
2663 }
2664 }
2665
// Selects elements from two source vectors (src1, src2) based on index values in the index
// register, using Neon instructions, and places each selected element in the destination
// vector element corresponding to the index vector element. Each index in the index register
// must be in the range [0, 2 * NUM_ELEM), where NUM_ELEM is the number of BasicType elements
// per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
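// For example (illustrative), with four int elements per vector (NUM_ELEM == 4), an
// index value of 5 selects src2[1], while an index value of 2 selects src1[2].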
2672 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2673 FloatRegister src2, FloatRegister index,
2674 FloatRegister tmp, unsigned vector_length_in_bytes) {
2675 assert_different_registers(dst, src1, src2, tmp);
2676 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2677
2678 if (vector_length_in_bytes == 16) {
2679 assert(UseSVE <= 1, "sve must be <= 1");
2680 assert(src1->successor() == src2, "Source registers must be ordered");
// If the vector length is 16B, then use the Neon "tbl" instruction with a two-vector table.
2682 tbl(dst, size, src1, 2, index);
2683 } else { // vector length == 8
2684 assert(UseSVE == 0, "must be Neon only");
2685 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2686 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
// instruction with a one-vector lookup.
2688 ins(tmp, D, src1, 0, 0);
2689 ins(tmp, D, src2, 1, 0);
2690 tbl(dst, size, tmp, 1, index);
2691 }
2692 }
2693
// Selects elements from two source vectors (src1, src2) based on index values in the index
// register, using SVE/SVE2 instructions, and places each selected element in the destination
// vector element corresponding to the index vector element. Each index in the index register
// must be in the range [0, 2 * NUM_ELEM), where NUM_ELEM is the number of BasicType elements
// per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2700 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2701 FloatRegister src2, FloatRegister index,
2702 FloatRegister tmp, SIMD_RegVariant T,
2703 unsigned vector_length_in_bytes) {
2704 assert_different_registers(dst, src1, src2, index, tmp);
2705
2706 if (vector_length_in_bytes == 8) {
2707 // We need to fit both the source vectors (src1, src2) in a single vector register because the
2708 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2709 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
// instruction with a one-vector lookup.
2711 assert(UseSVE >= 1, "sve must be >= 1");
2712 ins(tmp, D, src1, 0, 0);
2713 ins(tmp, D, src2, 1, 0);
2714 sve_tbl(dst, T, tmp, index);
2715 } else { // UseSVE == 2 and vector_length_in_bytes > 8
// If the vector length is > 8, then use the SVE2 "tbl" instruction with a two-vector table.
2717 // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2718 // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2719 // with the only exception of 8B vector length.
2720 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2721 assert(src1->successor() == src2, "Source registers must be ordered");
2722 sve_tbl(dst, T, src1, src2, index);
2723 }
2724 }
2725
2726 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2727 FloatRegister src2, FloatRegister index,
2728 FloatRegister tmp, BasicType bt,
2729 unsigned vector_length_in_bytes) {
2730
2731 assert_different_registers(dst, src1, src2, index, tmp);
2732
2733 // The cases that can reach this method are -
2734 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2735 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2736 //
2737 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2738 // and UseSVE = 2 with vector_length_in_bytes >= 8
2739 //
2740 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2741 // UseSVE = 1 with vector_length_in_bytes = 16
2742
2743 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2744 SIMD_RegVariant T = elemType_to_regVariant(bt);
2745 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2746 return;
2747 }
2748
2749 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2750 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2751 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2752
2753 bool isQ = vector_length_in_bytes == 16;
2754
2755 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2756 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2757
2758 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2759 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2760 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2761 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2762 // the indices can range from [0, 8).
2763 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2764 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2765 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2766 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2767 // Add the multiplied result to the vector in tmp to obtain the byte level
2768 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2769 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2770
2771 if (bt == T_BYTE) {
2772 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2773 } else {
2774 int elem_size = (bt == T_SHORT) ? 2 : 4;
2775 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2776
2777 mov(tmp, size1, elem_size);
2778 mulv(dst, size2, index, tmp);
2779 mov(tmp, size2, tbl_offset);
2780 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2781 // to select a set of 2B/4B
2782 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2783 }
2784 }
2785
2786 // Vector expand implementation. Elements from the src vector are expanded into
2787 // the dst vector under the control of the vector mask.
2788 // Since there are no native instructions directly corresponding to expand before
2789 // SVE2p2, the following implementations mainly leverages the TBL instruction to
2790 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2791 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2792 // for NEON and SVE, but with different instructions where appropriate.
2793
2794 // Vector expand implementation for NEON.
2795 //
2796 // An example of 128-bit Byte vector:
2797 // Data direction: high <== low
2798 // Input:
2799 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2800 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2801 // Expected result:
2802 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2803 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2804 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2805 int vector_length_in_bytes) {
2806 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2807 assert_different_registers(dst, src, mask, tmp1, tmp2);
// Since the TBL instruction only supports byte tables, we need to
2809 // compute indices in byte type for all types.
2810 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2811 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2812 dup(tmp1, size, zr);
2813 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
2814 negr(dst, size, mask);
2815 // Calculate vector index for TBL with prefix sum algorithm.
2816 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
2817 for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2818 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2819 addv(dst, size, tmp2, dst);
2820 }
2821 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2822 orr(tmp2, size, mask, mask);
2823 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2824 bsl(tmp2, size, dst, tmp1);
2825 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2826 movi(tmp1, size, 1);
2827 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
2828 subv(dst, size, tmp2, tmp1);
2829 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2830 tbl(dst, size, src, 1, dst);
2831 }
2832
2833 // Vector expand implementation for SVE.
2834 //
2835 // An example of 128-bit Short vector:
2836 // Data direction: high <== low
2837 // Input:
2838 // src = gf ed cb a9 87 65 43 21
2839 // pg = 00 01 00 01 00 01 00 01
2840 // Expected result:
2841 // dst = 00 87 00 65 00 43 00 21
2842 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2843 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2844 int vector_length_in_bytes) {
2845 assert(UseSVE > 0, "expand implementation only for SVE");
2846 assert_different_registers(dst, src, tmp1, tmp2);
2847 SIMD_RegVariant size = elemType_to_regVariant(bt);
2848
2849 // tmp1 = 00 00 00 00 00 00 00 00
2850 sve_dup(tmp1, size, 0);
2851 sve_movprfx(tmp2, tmp1);
2852 // tmp2 = 00 01 00 01 00 01 00 01
2853 sve_cpy(tmp2, size, pg, 1, true);
2854 // Calculate vector index for TBL with prefix sum algorithm.
2855 // tmp2 = 04 04 03 03 02 02 01 01
2856 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2857 sve_movprfx(dst, tmp1);
// The EXT instruction operates on the full-width SVE register. The correct
2859 // index calculation method is:
2860 // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2861 // MaxVectorSize - i.
2862 sve_ext(dst, tmp2, MaxVectorSize - i);
2863 sve_add(tmp2, size, dst, tmp2);
2864 }
2865 // dst = 00 04 00 03 00 02 00 01
2866 sve_sel(dst, size, pg, tmp2, tmp1);
2867 // dst = -1 03 -1 02 -1 01 -1 00
2868 sve_sub(dst, size, 1);
2869 // dst = 00 87 00 65 00 43 00 21
2870 sve_tbl(dst, size, src, dst);
2871 }