1 /*
2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "opto/c2_MacroAssembler.hpp"
28 #include "opto/compile.hpp"
29 #include "opto/intrinsicnode.hpp"
30 #include "opto/matcher.hpp"
31 #include "opto/output.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/stubRoutines.hpp"
34 #include "utilities/globalDefinitions.hpp"
35 #include "utilities/powerOfTwo.hpp"
36
37 #ifdef PRODUCT
38 #define BLOCK_COMMENT(str) /* nothing */
39 #define STOP(error) stop(error)
40 #else
41 #define BLOCK_COMMENT(str) block_comment(str)
42 #define STOP(error) block_comment(error); stop(error)
43 #endif
44
45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
46
47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
48
49 void C2_MacroAssembler::entry_barrier() {
50 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
51 // Dummy labels for just measuring the code size
52 Label dummy_slow_path;
53 Label dummy_continuation;
54 Label dummy_guard;
55 Label* slow_path = &dummy_slow_path;
56 Label* continuation = &dummy_continuation;
57 Label* guard = &dummy_guard;
58 if (!Compile::current()->output()->in_scratch_emit_size()) {
59 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
60 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
61 Compile::current()->output()->add_stub(stub);
62 slow_path = &stub->entry();
63 continuation = &stub->continuation();
64 guard = &stub->guard();
65 }
66 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
67 bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
68 }
69
70 // jdk.internal.util.ArraysSupport.vectorizedHashCode
71 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
72 FloatRegister vdata0, FloatRegister vdata1,
73 FloatRegister vdata2, FloatRegister vdata3,
74 FloatRegister vmul0, FloatRegister vmul1,
75 FloatRegister vmul2, FloatRegister vmul3,
76 FloatRegister vpow, FloatRegister vpowm,
77 BasicType eltype) {
78 ARRAYS_HASHCODE_REGISTERS;
79
80 Register tmp1 = rscratch1, tmp2 = rscratch2;
81
82 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
83
  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
87 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
88 : eltype == T_CHAR || eltype == T_SHORT ? 8
89 : eltype == T_INT ? 4
90 : 0;
91 guarantee(vf, "unsupported eltype");
92
93 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
94 const size_t unroll_factor = 4;
95
96 switch (eltype) {
97 case T_BOOLEAN:
98 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
99 break;
100 case T_CHAR:
101 BLOCK_COMMENT("arrays_hashcode(char) {");
102 break;
103 case T_BYTE:
104 BLOCK_COMMENT("arrays_hashcode(byte) {");
105 break;
106 case T_SHORT:
107 BLOCK_COMMENT("arrays_hashcode(short) {");
108 break;
109 case T_INT:
110 BLOCK_COMMENT("arrays_hashcode(int) {");
111 break;
112 default:
113 ShouldNotReachHere();
114 }
115
116 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
117 // implemented by the stub executes just once. Call the stub only if at least two iterations will
118 // be executed.
119 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
120 cmpw(cnt, large_threshold);
121 br(Assembler::HS, LARGE);
122
123 bind(TAIL);
124
  // The andr computes cnt % uf, where uf = unroll_factor. The shifted subtract then moves the
  // branch target past the first uf - (cnt % uf) load + madd pairs, so only cnt % uf such pairs
  // execute on the first pass. Subsequent iterations consume the remainder, uf elements at a time.
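  // For example, with uf == 4 and cnt % 4 == 1, the branch skips 3 load + madd pairs and only
  // the last pair executes before reaching BR_BASE.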
128 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
129 andr(tmp2, cnt, unroll_factor - 1);
130 adr(tmp1, BR_BASE);
  // On Cortex-A53 the shift is 4 because each load + madd pair is padded with 2 nops
  // (4 instructions, i.e. 16 bytes, per pair).
132 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
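  // tmp2 holds the multiplier 31 (0x1f) used by the scalar hash update below
  // (result = 31 * result + element).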
133 movw(tmp2, 0x1f);
134 br(tmp1);
135
136 bind(LOOP);
137 for (size_t i = 0; i < unroll_factor; ++i) {
138 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
139 maddw(result, result, tmp2, tmp1);
140 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
141 // Generate 2nd nop to have 4 instructions per iteration.
142 if (VM_Version::supports_a53mac()) {
143 nop();
144 }
145 }
146 bind(BR_BASE);
147 subsw(cnt, cnt, unroll_factor);
148 br(Assembler::HS, LOOP);
149
150 b(DONE);
151
152 bind(LARGE);
153
154 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
155 assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
156 address tpc = trampoline_call(stub);
157 if (tpc == nullptr) {
158 DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
159 postcond(pc() == badAddress);
160 return nullptr;
161 }
162
163 bind(DONE);
164
165 BLOCK_COMMENT("} // arrays_hashcode");
166
167 postcond(pc() != badAddress);
168 return pc();
169 }
170
171 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
172 Register t2, Register t3) {
173 assert_different_registers(obj, box, t1, t2, t3, rscratch2);
174
175 // Handle inflated monitor.
176 Label inflated;
  // Finish fast lock successfully. MUST branch to this label with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to this label with flag == NE
  Label slow_path;
181
182 if (UseObjectMonitorTable) {
183 // Clear cache in case fast locking succeeds or we need to take the slow-path.
184 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
185 }
186
187 if (DiagnoseSyncOnValueBasedClasses != 0) {
188 load_klass(t1, obj);
189 ldrb(t1, Address(t1, Klass::misc_flags_offset()));
190 tst(t1, KlassFlags::_misc_is_value_based_class);
191 br(Assembler::NE, slow_path);
192 }
193
194 const Register t1_mark = t1;
195 const Register t3_t = t3;
196
197 { // Lightweight locking
198
    // Push lock to the lock stack and finish successfully. MUST branch to this label with flag == EQ
200 Label push;
201
202 const Register t2_top = t2;
203
204 // Check if lock-stack is full.
205 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
206 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
207 br(Assembler::GT, slow_path);
208
209 // Check if recursive.
210 subw(t3_t, t2_top, oopSize);
211 ldr(t3_t, Address(rthread, t3_t));
212 cmp(obj, t3_t);
213 br(Assembler::EQ, push);
214
215 // Relaxed normal load to check for monitor. Optimization for monitor case.
216 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
217 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
218
219 // Not inflated
220 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
221
222 // Try to lock. Transition lock-bits 0b01 => 0b00
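    // orr sets the expected value to the mark word with the unlocked bit (0b01) set;
    // eor then clears that bit to form the locked (0b00) value installed by the CAS.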
223 orr(t1_mark, t1_mark, markWord::unlocked_value);
224 eor(t3_t, t1_mark, markWord::unlocked_value);
225 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
226 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
227 br(Assembler::NE, slow_path);
228
229 bind(push);
230 // After successful lock, push object on lock-stack.
231 str(obj, Address(rthread, t2_top));
232 addw(t2_top, t2_top, oopSize);
233 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
234 b(locked);
235 }
236
237 { // Handle inflated monitor.
238 bind(inflated);
239
240 const Register t1_monitor = t1;
241
242 if (!UseObjectMonitorTable) {
243 assert(t1_monitor == t1_mark, "should be the same here");
244 } else {
245 Label monitor_found;
246
247 // Load cache address
248 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
249
250 const int num_unrolled = 2;
251 for (int i = 0; i < num_unrolled; i++) {
252 ldr(t1, Address(t3_t));
253 cmp(obj, t1);
254 br(Assembler::EQ, monitor_found);
255 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
256 }
257
258 Label loop;
259
260 // Search for obj in cache.
261 bind(loop);
262
263 // Check for match.
264 ldr(t1, Address(t3_t));
265 cmp(obj, t1);
266 br(Assembler::EQ, monitor_found);
267
268 // Search until null encountered, guaranteed _null_sentinel at end.
269 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
270 cbnz(t1, loop);
271 // Cache Miss, NE set from cmp above, cbnz does not set flags
272 b(slow_path);
273
274 bind(monitor_found);
275 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
276 }
277
278 const Register t2_owner_addr = t2;
279 const Register t3_owner = t3;
280 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
281 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
282 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
283
284 Label monitor_locked;
285
286 // Compute owner address.
287 lea(t2_owner_addr, owner_address);
288
289 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
290 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
291 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
292 /*release*/ false, /*weak*/ false, t3_owner);
293 br(Assembler::EQ, monitor_locked);
294
295 // Check if recursive.
296 cmp(t3_owner, rscratch2);
297 br(Assembler::NE, slow_path);
298
299 // Recursive.
300 increment(recursions_address, 1);
301
302 bind(monitor_locked);
303 if (UseObjectMonitorTable) {
304 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
305 }
306 }
307
308 bind(locked);
309
310 #ifdef ASSERT
311 // Check that locked label is reached with Flags == EQ.
312 Label flag_correct;
313 br(Assembler::EQ, flag_correct);
314 stop("Fast Lock Flag != EQ");
315 #endif
316
317 bind(slow_path);
318 #ifdef ASSERT
319 // Check that slow_path label is reached with Flags == NE.
320 br(Assembler::NE, flag_correct);
321 stop("Fast Lock Flag != NE");
322 bind(flag_correct);
323 #endif
324 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
325 }
326
327 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
328 Register t2, Register t3) {
329 assert_different_registers(obj, box, t1, t2, t3);
330
331 // Handle inflated monitor.
332 Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to this label with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to this label with flag == NE
  Label slow_path;
337
338 const Register t1_mark = t1;
339 const Register t2_top = t2;
340 const Register t3_t = t3;
341
342 { // Lightweight unlock
343
344 Label push_and_slow_path;
345
346 // Check if obj is top of lock-stack.
347 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
348 subw(t2_top, t2_top, oopSize);
349 ldr(t3_t, Address(rthread, t2_top));
350 cmp(obj, t3_t);
351 // Top of lock stack was not obj. Must be monitor.
352 br(Assembler::NE, inflated_load_mark);
353
354 // Pop lock-stack.
355 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
356 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
357
358 // Check if recursive.
359 subw(t3_t, t2_top, oopSize);
360 ldr(t3_t, Address(rthread, t3_t));
361 cmp(obj, t3_t);
362 br(Assembler::EQ, unlocked);
363
364 // Not recursive.
365 // Load Mark.
366 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
367
368 // Check header for monitor (0b10).
369 // Because we got here by popping (meaning we pushed in locked)
370 // there will be no monitor in the box. So we need to push back the obj
371 // so that the runtime can fix any potential anonymous owner.
372 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
373
374 // Try to unlock. Transition lock bits 0b00 => 0b01
375 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
376 orr(t3_t, t1_mark, markWord::unlocked_value);
377 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
378 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
379 br(Assembler::EQ, unlocked);
380
381 bind(push_and_slow_path);
382 // Compare and exchange failed.
383 // Restore lock-stack and handle the unlock in runtime.
384 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
385 addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
387 b(slow_path);
388 }
389
390
391 { // Handle inflated monitor.
392 bind(inflated_load_mark);
393 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
394 #ifdef ASSERT
395 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
396 stop("Fast Unlock not monitor");
397 #endif
398
399 bind(inflated);
400
401 #ifdef ASSERT
402 Label check_done;
403 subw(t2_top, t2_top, oopSize);
404 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
405 br(Assembler::LT, check_done);
406 ldr(t3_t, Address(rthread, t2_top));
407 cmp(obj, t3_t);
408 br(Assembler::NE, inflated);
409 stop("Fast Unlock lock on stack");
410 bind(check_done);
411 #endif
412
413 const Register t1_monitor = t1;
414
415 if (!UseObjectMonitorTable) {
416 assert(t1_monitor == t1_mark, "should be the same here");
417
418 // Untag the monitor.
419 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
420 } else {
421 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
422 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
423 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
424 br(Assembler::LO, slow_path);
425 }
426
427 const Register t2_recursions = t2;
428 Label not_recursive;
429
430 // Check if recursive.
431 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
432 cbz(t2_recursions, not_recursive);
433
434 // Recursive unlock.
435 sub(t2_recursions, t2_recursions, 1u);
436 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
437 // Set flag == EQ
438 cmp(t2_recursions, t2_recursions);
439 b(unlocked);
440
441 bind(not_recursive);
442
443 const Register t2_owner_addr = t2;
444
445 // Compute owner address.
446 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
447
448 // Set owner to null.
449 // Release to satisfy the JMM
450 stlr(zr, t2_owner_addr);
451 // We need a full fence after clearing owner to avoid stranding.
452 // StoreLoad achieves this.
453 membar(StoreLoad);
454
455 // Check if the entry_list is empty.
456 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
457 cmp(rscratch1, zr);
458 br(Assembler::EQ, unlocked); // If so we are done.
459
460 // Check if there is a successor.
461 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
462 cmp(rscratch1, zr);
463 br(Assembler::NE, unlocked); // If so we are done.
464
465 // Save the monitor pointer in the current thread, so we can try to
466 // reacquire the lock in SharedRuntime::monitor_exit_helper().
467 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
468
469 cmp(zr, rthread); // Set Flag to NE => slow path
470 b(slow_path);
471 }
472
473 bind(unlocked);
474 cmp(zr, zr); // Set Flags to EQ => fast path
475
476 #ifdef ASSERT
477 // Check that unlocked label is reached with Flags == EQ.
478 Label flag_correct;
479 br(Assembler::EQ, flag_correct);
480 stop("Fast Unlock Flag != EQ");
481 #endif
482
483 bind(slow_path);
484 #ifdef ASSERT
485 // Check that slow_path label is reached with Flags == NE.
486 br(Assembler::NE, flag_correct);
487 stop("Fast Unlock Flag != NE");
488 bind(flag_correct);
489 #endif
490 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
491 }
492
493 // Search for str1 in str2 and return index or -1
494 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
495 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
496 Register cnt2, Register cnt1,
497 Register tmp1, Register tmp2,
498 Register tmp3, Register tmp4,
499 Register tmp5, Register tmp6,
500 int icnt1, Register result, int ae) {
501 // NOTE: tmp5, tmp6 can be zr depending on specific method version
502 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
503
504 Register ch1 = rscratch1;
505 Register ch2 = rscratch2;
506 Register cnt1tmp = tmp1;
507 Register cnt2tmp = tmp2;
508 Register cnt1_neg = cnt1;
509 Register cnt2_neg = cnt2;
510 Register result_tmp = tmp4;
511
512 bool isL = ae == StrIntrinsicNode::LL;
513
514 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
515 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
516 int str1_chr_shift = str1_isL ? 0:1;
517 int str2_chr_shift = str2_isL ? 0:1;
518 int str1_chr_size = str1_isL ? 1:2;
519 int str2_chr_size = str2_isL ? 1:2;
520 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
521 (chr_insn)&MacroAssembler::ldrh;
522 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
523 (chr_insn)&MacroAssembler::ldrh;
524 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
525 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
526
527 // Note, inline_string_indexOf() generates checks:
528 // if (substr.count > string.count) return -1;
529 // if (substr.count == 0) return 0;
530
531 // We have two strings, a source string in str2, cnt2 and a pattern string
532 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
533
534 // For larger pattern and source we use a simplified Boyer Moore algorithm.
535 // With a small pattern and source we use linear scan.
536
537 if (icnt1 == -1) {
538 sub(result_tmp, cnt2, cnt1);
539 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
540 br(LT, LINEARSEARCH);
541 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
542 subs(zr, cnt1, 256);
543 lsr(tmp1, cnt2, 2);
544 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
545 br(GE, LINEARSTUB);
546 }
547
  // The Boyer Moore algorithm is based on the description here:-
549 //
550 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
551 //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
553 // and the 'Good Suffix' rule.
554 //
555 // These rules are essentially heuristics for how far we can shift the
556 // pattern along the search string.
557 //
558 // The implementation here uses the 'Bad Character' rule only because of the
559 // complexity of initialisation for the 'Good Suffix' rule.
560 //
561 // This is also known as the Boyer-Moore-Horspool algorithm:-
562 //
563 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
564 //
  // This particular implementation has a few Java-specific optimizations.
566 //
567 // #define ASIZE 256
568 //
569 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
570 // int i, j;
571 // unsigned c;
572 // unsigned char bc[ASIZE];
573 //
574 // /* Preprocessing */
575 // for (i = 0; i < ASIZE; ++i)
576 // bc[i] = m;
577 // for (i = 0; i < m - 1; ) {
578 // c = x[i];
579 // ++i;
580 // // c < 256 for Latin1 string, so, no need for branch
581 // #ifdef PATTERN_STRING_IS_LATIN1
582 // bc[c] = m - i;
583 // #else
584 // if (c < ASIZE) bc[c] = m - i;
585 // #endif
586 // }
587 //
588 // /* Searching */
589 // j = 0;
590 // while (j <= n - m) {
  //     c = y[j+m-1];
592 // if (x[m-1] == c)
593 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
594 // if (i < 0) return j;
595 // // c < 256 for Latin1 string, so, no need for branch
596 // #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c < 256) always true. Remove branch
598 // j += bc[y[j+m-1]];
599 // #endif
600 // #ifndef PATTERN_STRING_IS_UTF
601 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
602 // if (c < ASIZE)
603 // j += bc[y[j+m-1]];
604 // else
605 // j += 1
606 // #endif
607 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
608 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
609 // if (c < ASIZE)
610 // j += bc[y[j+m-1]];
611 // else
612 // j += m
613 // #endif
614 // }
615 // }
616
617 if (icnt1 == -1) {
618 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
619 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
620 Register cnt1end = tmp2;
621 Register str2end = cnt2;
622 Register skipch = tmp2;
623
    // str1 length is >= 8, so we can read at least 1 register when no UTF->Latin1
    // conversion is needed (8 chars for LL, 4 for UU) and half a register for the
    // UL case. We'll re-read the last character in the inner pre-loop code to keep
    // a single load in the outer pre-loop.
628 const int firstStep = isL ? 7 : 3;
629
630 const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
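    // Allocate the bad-character table on the stack and fill it with the pattern length
    // (each byte of v0 was set to cnt1 earlier), 32 bytes per store.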
632 sub(sp, sp, ASIZE);
633 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
634 mov(ch1, sp);
635 BIND(BM_INIT_LOOP);
636 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
637 subs(tmp5, tmp5, 1);
638 br(GT, BM_INIT_LOOP);
639
640 sub(cnt1tmp, cnt1, 1);
641 mov(tmp5, str2);
642 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
643 sub(ch2, cnt1, 1);
644 mov(tmp3, str1);
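    // BCLOOP computes the 'Bad Character' shifts: bc[x[i]] = m - 1 - i for i = 0 .. m - 2
    // (characters >= ASIZE are skipped for UTF-16 patterns).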
645 BIND(BCLOOP);
646 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
647 if (!str1_isL) {
648 subs(zr, ch1, ASIZE);
649 br(HS, BCSKIP);
650 }
651 strb(ch2, Address(sp, ch1));
652 BIND(BCSKIP);
653 subs(ch2, ch2, 1);
654 br(GT, BCLOOP);
655
656 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
657 if (str1_isL == str2_isL) {
658 // load last 8 bytes (8LL/4UU symbols)
659 ldr(tmp6, Address(tmp6, -wordSize));
660 } else {
661 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
662 // convert Latin1 to UTF. We'll have to wait until load completed, but
663 // it's still faster than per-character loads+checks
664 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
665 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
666 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
667 andr(tmp6, tmp6, 0xFF); // str1[N-4]
668 orr(ch2, ch1, ch2, LSL, 16);
669 orr(tmp6, tmp6, tmp3, LSL, 48);
670 orr(tmp6, tmp6, ch2, LSL, 16);
671 }
672 BIND(BMLOOPSTR2);
673 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
674 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
675 if (str1_isL == str2_isL) {
676 // re-init tmp3. It's for free because it's executed in parallel with
677 // load above. Alternative is to initialize it before loop, but it'll
678 // affect performance on in-order systems with 2 or more ld/st pipelines
679 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
680 }
681 if (!isL) { // UU/UL case
682 lsl(ch2, cnt1tmp, 1); // offset in bytes
683 }
684 cmp(tmp3, skipch);
685 br(NE, BMSKIP);
686 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
687 mov(ch1, tmp6);
688 if (isL) {
689 b(BMLOOPSTR1_AFTER_LOAD);
690 } else {
691 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
692 b(BMLOOPSTR1_CMP);
693 }
694 BIND(BMLOOPSTR1);
695 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
696 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
697 BIND(BMLOOPSTR1_AFTER_LOAD);
698 subs(cnt1tmp, cnt1tmp, 1);
699 br(LT, BMLOOPSTR1_LASTCMP);
700 BIND(BMLOOPSTR1_CMP);
701 cmp(ch1, ch2);
702 br(EQ, BMLOOPSTR1);
703 BIND(BMSKIP);
704 if (!isL) {
      // if we've met a UTF symbol while searching for a Latin1 pattern, then we
      // can skip cnt1 symbols
707 if (str1_isL != str2_isL) {
708 mov(result_tmp, cnt1);
709 } else {
710 mov(result_tmp, 1);
711 }
712 subs(zr, skipch, ASIZE);
713 br(HS, BMADV);
714 }
715 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
716 BIND(BMADV);
717 sub(cnt1tmp, cnt1, 1);
718 add(str2, str2, result_tmp, LSL, str2_chr_shift);
719 cmp(str2, str2end);
720 br(LE, BMLOOPSTR2);
721 add(sp, sp, ASIZE);
722 b(NOMATCH);
723 BIND(BMLOOPSTR1_LASTCMP);
724 cmp(ch1, ch2);
725 br(NE, BMSKIP);
726 BIND(BMMATCH);
727 sub(result, str2, tmp5);
728 if (!str2_isL) lsr(result, result, 1);
729 add(sp, sp, ASIZE);
730 b(DONE);
731
732 BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
734 br(LT, LINEAR_MEDIUM);
735 mov(result, zr);
736 RuntimeAddress stub = nullptr;
737 if (isL) {
738 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
739 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
740 } else if (str1_isL) {
741 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
742 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
743 } else {
744 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
745 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
746 }
747 address call = trampoline_call(stub);
748 if (call == nullptr) {
749 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
750 ciEnv::current()->record_failure("CodeCache is full");
751 return;
752 }
753 b(DONE);
754 }
755
756 BIND(LINEARSEARCH);
757 {
758 Label DO1, DO2, DO3;
759
760 Register str2tmp = tmp2;
761 Register first = tmp3;
762
763 if (icnt1 == -1)
764 {
765 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
766
767 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
768 br(LT, DOSHORT);
769 BIND(LINEAR_MEDIUM);
770 (this->*str1_load_1chr)(first, Address(str1));
771 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
772 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
773 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
774 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
775
776 BIND(FIRST_LOOP);
777 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
778 cmp(first, ch2);
779 br(EQ, STR1_LOOP);
780 BIND(STR2_NEXT);
781 adds(cnt2_neg, cnt2_neg, str2_chr_size);
782 br(LE, FIRST_LOOP);
783 b(NOMATCH);
784
785 BIND(STR1_LOOP);
786 adds(cnt1tmp, cnt1_neg, str1_chr_size);
787 add(cnt2tmp, cnt2_neg, str2_chr_size);
788 br(GE, MATCH);
789
790 BIND(STR1_NEXT);
791 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
792 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
793 cmp(ch1, ch2);
794 br(NE, STR2_NEXT);
795 adds(cnt1tmp, cnt1tmp, str1_chr_size);
796 add(cnt2tmp, cnt2tmp, str2_chr_size);
797 br(LT, STR1_NEXT);
798 b(MATCH);
799
800 BIND(DOSHORT);
801 if (str1_isL == str2_isL) {
802 cmp(cnt1, (u1)2);
803 br(LT, DO1);
804 br(GT, DO3);
805 }
806 }
807
808 if (icnt1 == 4) {
809 Label CH1_LOOP;
810
811 (this->*load_4chr)(ch1, str1);
812 sub(result_tmp, cnt2, 4);
813 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
814 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
815
816 BIND(CH1_LOOP);
817 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
818 cmp(ch1, ch2);
819 br(EQ, MATCH);
820 adds(cnt2_neg, cnt2_neg, str2_chr_size);
821 br(LE, CH1_LOOP);
822 b(NOMATCH);
823 }
824
825 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
826 Label CH1_LOOP;
827
828 BIND(DO2);
829 (this->*load_2chr)(ch1, str1);
830 if (icnt1 == 2) {
831 sub(result_tmp, cnt2, 2);
832 }
833 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
834 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
835 BIND(CH1_LOOP);
836 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
837 cmp(ch1, ch2);
838 br(EQ, MATCH);
839 adds(cnt2_neg, cnt2_neg, str2_chr_size);
840 br(LE, CH1_LOOP);
841 b(NOMATCH);
842 }
843
844 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
845 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
846
847 BIND(DO3);
848 (this->*load_2chr)(first, str1);
849 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
850 if (icnt1 == 3) {
851 sub(result_tmp, cnt2, 3);
852 }
853 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
854 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
855 BIND(FIRST_LOOP);
856 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
857 cmpw(first, ch2);
858 br(EQ, STR1_LOOP);
859 BIND(STR2_NEXT);
860 adds(cnt2_neg, cnt2_neg, str2_chr_size);
861 br(LE, FIRST_LOOP);
862 b(NOMATCH);
863
864 BIND(STR1_LOOP);
865 add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
866 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
867 cmp(ch1, ch2);
868 br(NE, STR2_NEXT);
869 b(MATCH);
870 }
871
872 if (icnt1 == -1 || icnt1 == 1) {
873 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
874
875 BIND(DO1);
876 (this->*str1_load_1chr)(ch1, str1);
877 cmp(cnt2, (u1)8);
878 br(LT, DO1_SHORT);
879
880 sub(result_tmp, cnt2, 8/str2_chr_size);
881 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
882 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
883 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
884
885 if (str2_isL) {
886 orr(ch1, ch1, ch1, LSL, 8);
887 }
888 orr(ch1, ch1, ch1, LSL, 16);
889 orr(ch1, ch1, ch1, LSL, 32);
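      // Each iteration XORs the replicated search char with 8 bytes (or 4 UTF-16 chars) of the
      // source. A zero byte/halfword marks a match and is detected with the usual SWAR trick:
      // (x - 0x01..01) & ~(x | 0x7f..7f) has the sign bit set only in zero elements.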
890 BIND(CH1_LOOP);
891 ldr(ch2, Address(str2, cnt2_neg));
892 eor(ch2, ch1, ch2);
893 sub(tmp1, ch2, tmp3);
894 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
895 bics(tmp1, tmp1, tmp2);
896 br(NE, HAS_ZERO);
897 adds(cnt2_neg, cnt2_neg, 8);
898 br(LT, CH1_LOOP);
899
900 cmp(cnt2_neg, (u1)8);
901 mov(cnt2_neg, 0);
902 br(LT, CH1_LOOP);
903 b(NOMATCH);
904
905 BIND(HAS_ZERO);
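      // tmp1 has the sign bit set in the matching element. rev + clz give the byte offset of the
      // first match in memory order; LSR 3 converts the bit count to bytes.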
906 rev(tmp1, tmp1);
907 clz(tmp1, tmp1);
908 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
909 b(MATCH);
910
911 BIND(DO1_SHORT);
912 mov(result_tmp, cnt2);
913 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
914 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
915 BIND(DO1_LOOP);
916 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
917 cmpw(ch1, ch2);
918 br(EQ, MATCH);
919 adds(cnt2_neg, cnt2_neg, str2_chr_size);
920 br(LT, DO1_LOOP);
921 }
922 }
923 BIND(NOMATCH);
924 mov(result, -1);
925 b(DONE);
926 BIND(MATCH);
927 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
928 BIND(DONE);
929 }
930
931 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
932 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
933
934 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
935 Register ch, Register result,
936 Register tmp1, Register tmp2, Register tmp3)
937 {
938 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
939 Register cnt1_neg = cnt1;
940 Register ch1 = rscratch1;
941 Register result_tmp = rscratch2;
942
943 cbz(cnt1, NOMATCH);
944
945 cmp(cnt1, (u1)4);
946 br(LT, DO1_SHORT);
947
948 orr(ch, ch, ch, LSL, 16);
949 orr(ch, ch, ch, LSL, 32);
950
951 sub(cnt1, cnt1, 4);
952 mov(result_tmp, cnt1);
953 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
954 sub(cnt1_neg, zr, cnt1, LSL, 1);
955
956 mov(tmp3, 0x0001000100010001);
957
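  // The loop below uses the same SWAR zero-detection trick as string_indexof above,
  // applied to 16-bit chars (masks 0x0001... and 0x7fff...).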
958 BIND(CH1_LOOP);
959 ldr(ch1, Address(str1, cnt1_neg));
960 eor(ch1, ch, ch1);
961 sub(tmp1, ch1, tmp3);
962 orr(tmp2, ch1, 0x7fff7fff7fff7fff);
963 bics(tmp1, tmp1, tmp2);
964 br(NE, HAS_ZERO);
965 adds(cnt1_neg, cnt1_neg, 8);
966 br(LT, CH1_LOOP);
967
968 cmp(cnt1_neg, (u1)8);
969 mov(cnt1_neg, 0);
970 br(LT, CH1_LOOP);
971 b(NOMATCH);
972
973 BIND(HAS_ZERO);
974 rev(tmp1, tmp1);
975 clz(tmp1, tmp1);
976 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
977 b(MATCH);
978
979 BIND(DO1_SHORT);
980 mov(result_tmp, cnt1);
981 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
982 sub(cnt1_neg, zr, cnt1, LSL, 1);
983 BIND(DO1_LOOP);
984 ldrh(ch1, Address(str1, cnt1_neg));
985 cmpw(ch, ch1);
986 br(EQ, MATCH);
987 adds(cnt1_neg, cnt1_neg, 2);
988 br(LT, DO1_LOOP);
989 BIND(NOMATCH);
990 mov(result, -1);
991 b(DONE);
992 BIND(MATCH);
993 add(result, result_tmp, cnt1_neg, ASR, 1);
994 BIND(DONE);
995 }
996
997 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
998 Register ch, Register result,
999 FloatRegister ztmp1,
1000 FloatRegister ztmp2,
1001 PRegister tmp_pg,
1002 PRegister tmp_pdn, bool isL)
1003 {
1004 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1005 assert(tmp_pg->is_governing(),
1006 "this register has to be a governing predicate register");
1007
1008 Label LOOP, MATCH, DONE, NOMATCH;
1009 Register vec_len = rscratch1;
1010 Register idx = rscratch2;
1011
  SIMD_RegVariant T = isL ? B : H;
1013
1014 cbz(cnt1, NOMATCH);
1015
1016 // Assign the particular char throughout the vector.
1017 sve_dup(ztmp2, T, ch);
1018 if (isL) {
1019 sve_cntb(vec_len);
1020 } else {
1021 sve_cnth(vec_len);
1022 }
1023 mov(idx, 0);
1024
1025 // Generate a predicate to control the reading of input string.
1026 sve_whilelt(tmp_pg, T, idx, cnt1);
1027
1028 BIND(LOOP);
1029 // Read a vector of 8- or 16-bit data depending on the string type. Note
1030 // that inactive elements indicated by the predicate register won't cause
1031 // a data read from memory to the destination vector.
1032 if (isL) {
1033 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1034 } else {
1035 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1036 }
1037 add(idx, idx, vec_len);
1038
1039 // Perform the comparison. An element of the destination predicate is set
1040 // to active if the particular char is matched.
1041 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1042
1043 // Branch if the particular char is found.
1044 br(NE, MATCH);
1045
1046 sve_whilelt(tmp_pg, T, idx, cnt1);
1047
1048 // Loop back if the particular char not found.
1049 br(MI, LOOP);
1050
1051 BIND(NOMATCH);
1052 mov(result, -1);
1053 b(DONE);
1054
1055 BIND(MATCH);
1056 // Undo the index increment.
1057 sub(idx, idx, vec_len);
1058
1059 // Crop the vector to find its location.
1060 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1061 add(result, idx, -1);
1062 sve_incp(result, T, tmp_pdn);
1063 BIND(DONE);
1064 }
1065
1066 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1067 Register ch, Register result,
1068 Register tmp1, Register tmp2, Register tmp3)
1069 {
1070 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1071 Register cnt1_neg = cnt1;
1072 Register ch1 = rscratch1;
1073 Register result_tmp = rscratch2;
1074
1075 cbz(cnt1, NOMATCH);
1076
1077 cmp(cnt1, (u1)8);
1078 br(LT, DO1_SHORT);
1079
1080 orr(ch, ch, ch, LSL, 8);
1081 orr(ch, ch, ch, LSL, 16);
1082 orr(ch, ch, ch, LSL, 32);
1083
1084 sub(cnt1, cnt1, 8);
1085 mov(result_tmp, cnt1);
1086 lea(str1, Address(str1, cnt1));
1087 sub(cnt1_neg, zr, cnt1);
1088
1089 mov(tmp3, 0x0101010101010101);
1090
1091 BIND(CH1_LOOP);
1092 ldr(ch1, Address(str1, cnt1_neg));
1093 eor(ch1, ch, ch1);
1094 sub(tmp1, ch1, tmp3);
1095 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1096 bics(tmp1, tmp1, tmp2);
1097 br(NE, HAS_ZERO);
1098 adds(cnt1_neg, cnt1_neg, 8);
1099 br(LT, CH1_LOOP);
1100
1101 cmp(cnt1_neg, (u1)8);
1102 mov(cnt1_neg, 0);
1103 br(LT, CH1_LOOP);
1104 b(NOMATCH);
1105
1106 BIND(HAS_ZERO);
1107 rev(tmp1, tmp1);
1108 clz(tmp1, tmp1);
1109 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1110 b(MATCH);
1111
1112 BIND(DO1_SHORT);
1113 mov(result_tmp, cnt1);
1114 lea(str1, Address(str1, cnt1));
1115 sub(cnt1_neg, zr, cnt1);
1116 BIND(DO1_LOOP);
1117 ldrb(ch1, Address(str1, cnt1_neg));
1118 cmp(ch, ch1);
1119 br(EQ, MATCH);
1120 adds(cnt1_neg, cnt1_neg, 1);
1121 br(LT, DO1_LOOP);
1122 BIND(NOMATCH);
1123 mov(result, -1);
1124 b(DONE);
1125 BIND(MATCH);
1126 add(result, result_tmp, cnt1_neg);
1127 BIND(DONE);
1128 }
1129
1130 // Compare strings.
1131 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1132 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1133 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1134 PRegister pgtmp1, PRegister pgtmp2, int ae) {
1135 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1136 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1137 SHORT_LOOP_START, TAIL_CHECK;
1138
1139 bool isLL = ae == StrIntrinsicNode::LL;
1140 bool isLU = ae == StrIntrinsicNode::LU;
1141 bool isUL = ae == StrIntrinsicNode::UL;
1142
1143 // The stub threshold for LL strings is: 72 (64 + 8) chars
1144 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1145 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1146 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1147
1148 bool str1_isL = isLL || isLU;
1149 bool str2_isL = isLL || isUL;
1150
1151 int str1_chr_shift = str1_isL ? 0 : 1;
1152 int str2_chr_shift = str2_isL ? 0 : 1;
1153 int str1_chr_size = str1_isL ? 1 : 2;
1154 int str2_chr_size = str2_isL ? 1 : 2;
1155 int minCharsInWord = isLL ? wordSize : wordSize/2;
1156
1157 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1158 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1159 (chr_insn)&MacroAssembler::ldrh;
1160 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1161 (chr_insn)&MacroAssembler::ldrh;
1162 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1163 (uxt_insn)&MacroAssembler::uxthw;
1164
1165 BLOCK_COMMENT("string_compare {");
1166
1167 // Bizarrely, the counts are passed in bytes, regardless of whether they
1168 // are L or U strings, however the result is always in characters.
1169 if (!str1_isL) asrw(cnt1, cnt1, 1);
1170 if (!str2_isL) asrw(cnt2, cnt2, 1);
1171
1172 // Compute the minimum of the string lengths and save the difference.
1173 subsw(result, cnt1, cnt2);
1174 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1175
1176 // A very short string
1177 cmpw(cnt2, minCharsInWord);
1178 br(Assembler::LE, SHORT_STRING);
1179
1180 // Compare longwords
1181 // load first parts of strings and finish initialization while loading
1182 {
1183 if (str1_isL == str2_isL) { // LL or UU
1184 ldr(tmp1, Address(str1));
1185 cmp(str1, str2);
1186 br(Assembler::EQ, DONE);
1187 ldr(tmp2, Address(str2));
1188 cmp(cnt2, stub_threshold);
1189 br(GE, STUB);
1190 subsw(cnt2, cnt2, minCharsInWord);
1191 br(EQ, TAIL_CHECK);
1192 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1193 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1194 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1195 } else if (isLU) {
1196 ldrs(vtmp, Address(str1));
1197 ldr(tmp2, Address(str2));
1198 cmp(cnt2, stub_threshold);
1199 br(GE, STUB);
1200 subw(cnt2, cnt2, 4);
1201 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1202 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1203 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
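      // zip1 interleaves the loaded Latin1 bytes with zero bytes (vtmpZ), widening them to
      // UTF-16 chars so they can be compared against the U string.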
1204 zip1(vtmp, T8B, vtmp, vtmpZ);
1205 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1206 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1207 add(cnt1, cnt1, 4);
1208 fmovd(tmp1, vtmp);
1209 } else { // UL case
1210 ldr(tmp1, Address(str1));
1211 ldrs(vtmp, Address(str2));
1212 cmp(cnt2, stub_threshold);
1213 br(GE, STUB);
1214 subw(cnt2, cnt2, 4);
1215 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1216 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1217 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1218 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1219 zip1(vtmp, T8B, vtmp, vtmpZ);
1220 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1221 add(cnt1, cnt1, 8);
1222 fmovd(tmp2, vtmp);
1223 }
1224 adds(cnt2, cnt2, isUL ? 4 : 8);
1225 br(GE, TAIL);
1226 eor(rscratch2, tmp1, tmp2);
1227 cbnz(rscratch2, DIFF);
1228 // main loop
1229 bind(NEXT_WORD);
1230 if (str1_isL == str2_isL) {
1231 ldr(tmp1, Address(str1, cnt2));
1232 ldr(tmp2, Address(str2, cnt2));
1233 adds(cnt2, cnt2, 8);
1234 } else if (isLU) {
1235 ldrs(vtmp, Address(str1, cnt1));
1236 ldr(tmp2, Address(str2, cnt2));
1237 add(cnt1, cnt1, 4);
1238 zip1(vtmp, T8B, vtmp, vtmpZ);
1239 fmovd(tmp1, vtmp);
1240 adds(cnt2, cnt2, 8);
1241 } else { // UL
1242 ldrs(vtmp, Address(str2, cnt2));
1243 ldr(tmp1, Address(str1, cnt1));
1244 zip1(vtmp, T8B, vtmp, vtmpZ);
1245 add(cnt1, cnt1, 8);
1246 fmovd(tmp2, vtmp);
1247 adds(cnt2, cnt2, 4);
1248 }
1249 br(GE, TAIL);
1250
1251 eor(rscratch2, tmp1, tmp2);
1252 cbz(rscratch2, NEXT_WORD);
1253 b(DIFF);
1254 bind(TAIL);
1255 eor(rscratch2, tmp1, tmp2);
1256 cbnz(rscratch2, DIFF);
1257 // Last longword. In the case where length == 4 we compare the
1258 // same longword twice, but that's still faster than another
1259 // conditional branch.
1260 if (str1_isL == str2_isL) {
1261 ldr(tmp1, Address(str1));
1262 ldr(tmp2, Address(str2));
1263 } else if (isLU) {
1264 ldrs(vtmp, Address(str1));
1265 ldr(tmp2, Address(str2));
1266 zip1(vtmp, T8B, vtmp, vtmpZ);
1267 fmovd(tmp1, vtmp);
1268 } else { // UL
1269 ldrs(vtmp, Address(str2));
1270 ldr(tmp1, Address(str1));
1271 zip1(vtmp, T8B, vtmp, vtmpZ);
1272 fmovd(tmp2, vtmp);
1273 }
1274 bind(TAIL_CHECK);
1275 eor(rscratch2, tmp1, tmp2);
1276 cbz(rscratch2, DONE);
1277
1278 // Find the first different characters in the longwords and
1279 // compute their difference.
1280 bind(DIFF);
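    // rev + clz give the bit offset of the first differing byte (the strings are loaded
    // little-endian); andr rounds it down to a character boundary so both characters can be
    // shifted to bit 0, zero-extended and subtracted.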
1281 rev(rscratch2, rscratch2);
1282 clz(rscratch2, rscratch2);
1283 andr(rscratch2, rscratch2, isLL ? -8 : -16);
1284 lsrv(tmp1, tmp1, rscratch2);
1285 (this->*ext_chr)(tmp1, tmp1);
1286 lsrv(tmp2, tmp2, rscratch2);
1287 (this->*ext_chr)(tmp2, tmp2);
1288 subw(result, tmp1, tmp2);
1289 b(DONE);
1290 }
1291
1292 bind(STUB);
1293 RuntimeAddress stub = nullptr;
1294 switch(ae) {
1295 case StrIntrinsicNode::LL:
1296 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1297 break;
1298 case StrIntrinsicNode::UU:
1299 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1300 break;
1301 case StrIntrinsicNode::LU:
1302 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1303 break;
1304 case StrIntrinsicNode::UL:
1305 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1306 break;
1307 default:
1308 ShouldNotReachHere();
1309 }
1310 assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1311 address call = trampoline_call(stub);
1312 if (call == nullptr) {
1313 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1314 ciEnv::current()->record_failure("CodeCache is full");
1315 return;
1316 }
1317 b(DONE);
1318
1319 bind(SHORT_STRING);
1320 // Is the minimum length zero?
1321 cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while characters are being loaded,
  // and the next characters are loaded while the previous ones are compared.
1324 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1325 subs(cnt2, cnt2, 1);
1326 br(EQ, SHORT_LAST_INIT);
1327 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1328 b(SHORT_LOOP_START);
1329 bind(SHORT_LOOP);
1330 subs(cnt2, cnt2, 1);
1331 br(EQ, SHORT_LAST);
1332 bind(SHORT_LOOP_START);
1333 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1334 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1335 cmp(tmp1, cnt1);
1336 br(NE, SHORT_LOOP_TAIL);
1337 subs(cnt2, cnt2, 1);
1338 br(EQ, SHORT_LAST2);
1339 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1340 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1341 cmp(tmp2, rscratch1);
1342 br(EQ, SHORT_LOOP);
1343 sub(result, tmp2, rscratch1);
1344 b(DONE);
1345 bind(SHORT_LOOP_TAIL);
1346 sub(result, tmp1, cnt1);
1347 b(DONE);
1348 bind(SHORT_LAST2);
1349 cmp(tmp2, rscratch1);
1350 br(EQ, DONE);
1351 sub(result, tmp2, rscratch1);
1352
1353 b(DONE);
1354 bind(SHORT_LAST_INIT);
1355 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1356 bind(SHORT_LAST);
1357 cmp(tmp1, cnt1);
1358 br(EQ, DONE);
1359 sub(result, tmp1, cnt1);
1360
1361 bind(DONE);
1362
1363 BLOCK_COMMENT("} string_compare");
1364 }
1365
1366 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1367 FloatRegister src2, Condition cond, bool isQ) {
1368 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1369 FloatRegister zn = src1, zm = src2;
1370 bool needs_negation = false;
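  // NEON only provides EQ, GT/GE and unsigned HI/HS register-register compares, so
  // LT/LE/LO/LS are handled by swapping the operands and NE by inverting an EQ result.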
1371 switch (cond) {
1372 case LT: cond = GT; zn = src2; zm = src1; break;
1373 case LE: cond = GE; zn = src2; zm = src1; break;
1374 case LO: cond = HI; zn = src2; zm = src1; break;
1375 case LS: cond = HS; zn = src2; zm = src1; break;
1376 case NE: cond = EQ; needs_negation = true; break;
1377 default:
1378 break;
1379 }
1380
1381 if (is_floating_point_type(bt)) {
1382 fcm(cond, dst, size, zn, zm);
1383 } else {
1384 cm(cond, dst, size, zn, zm);
1385 }
1386
1387 if (needs_negation) {
1388 notr(dst, isQ ? T16B : T8B, dst);
1389 }
1390 }
1391
1392 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1393 Condition cond, bool isQ) {
1394 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1395 if (bt == T_FLOAT || bt == T_DOUBLE) {
1396 if (cond == Assembler::NE) {
1397 fcm(Assembler::EQ, dst, size, src);
1398 notr(dst, isQ ? T16B : T8B, dst);
1399 } else {
1400 fcm(cond, dst, size, src);
1401 }
1402 } else {
1403 if (cond == Assembler::NE) {
1404 cm(Assembler::EQ, dst, size, src);
1405 notr(dst, isQ ? T16B : T8B, dst);
1406 } else {
1407 cm(cond, dst, size, src);
1408 }
1409 }
1410 }
1411
1412 // Compress the least significant bit of each byte to the rightmost and clear
1413 // the higher garbage bits.
1414 void C2_MacroAssembler::bytemask_compress(Register dst) {
1415 // Example input, dst = 0x01 00 00 00 01 01 00 01
1416 // The "??" bytes are garbage.
1417 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1418 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1419 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1420 andr(dst, dst, 0xff); // dst = 0x8D
1421 }
1422
1423 // Pack the lowest-numbered bit of each mask element in src into a long value
1424 // in dst, at most the first 64 lane elements.
1425 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1426 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1427 FloatRegister vtmp1, FloatRegister vtmp2) {
1428 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1429 assert_different_registers(dst, rscratch1);
1430 assert_different_registers(vtmp1, vtmp2);
1431
1432 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1433 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1434 // Expected: dst = 0x658D
1435
1436 // Convert the mask into vector with sequential bytes.
1437 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1438 sve_cpy(vtmp1, size, src, 1, false);
1439 if (bt != T_BYTE) {
1440 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1441 }
1442
1443 if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1444 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1445 // is to compress each significant bit of the byte in a cross-lane way. Due
1446 // to the lack of a cross-lane bit-compress instruction, we use BEXT
1447 // (bit-compress in each lane) with the biggest lane size (T = D) then
1448 // concatenate the results.
1449
1450 // The second source input of BEXT, initialized with 0x01 in each byte.
1451 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1452 sve_dup(vtmp2, B, 1);
1453
1454 // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1455 // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1456 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1457 // ---------------------------------------
1458 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1459 sve_bext(vtmp1, D, vtmp1, vtmp2);
1460
1461 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1462 // result to dst.
1463 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1464 // dst = 0x658D
1465 if (lane_cnt <= 8) {
1466 // No need to concatenate.
1467 umov(dst, vtmp1, B, 0);
1468 } else if (lane_cnt <= 16) {
1469 ins(vtmp1, B, vtmp1, 1, 8);
1470 umov(dst, vtmp1, H, 0);
1471 } else {
1472 // As the lane count is 64 at most, the final expected value must be in
1473 // the lowest 64 bits after narrowing vtmp1 from D to B.
1474 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1475 umov(dst, vtmp1, D, 0);
1476 }
1477 } else if (UseSVE > 0) {
1478 // Compress the lowest 8 bytes.
1479 fmovd(dst, vtmp1);
1480 bytemask_compress(dst);
1481 if (lane_cnt <= 8) return;
1482
1483 // Repeat on higher bytes and join the results.
1484 // Compress 8 bytes in each iteration.
1485 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1486 sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1487 bytemask_compress(rscratch1);
1488 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1489 }
1490 } else {
1491 assert(false, "unsupported");
1492 ShouldNotReachHere();
1493 }
1494 }
1495
1496 // Unpack the mask, a long value in src, into predicate register dst based on the
1497 // corresponding data type. Note that dst can support at most 64 lanes.
1498 // Below example gives the expected dst predicate register in different types, with
1499 // a valid src(0x658D) on a 1024-bit vector size machine.
1500 // BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1501 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1502 // INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1503 // LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1504 //
1505 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1506 // has 24 significant bits would be an invalid input if dst predicate register refers to
1507 // a LONG type 1024-bit vector, which has at most 16 lanes.
1508 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1509 FloatRegister vtmp1, FloatRegister vtmp2) {
1510 assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1511 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1512 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1513 // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected: dst = 0b01100101 10001101
1515
1516 // Put long value from general purpose register into the first lane of vector.
1517 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1518 sve_dup(vtmp1, B, 0);
1519 mov(vtmp1, D, 0, src);
1520
  // As sve_cmp generates the mask with byte granularity at minimum, we need to
  // transform the value in the first lane, which is currently a bit mask, into a
  // byte mask. This can be done with SVE2's BDEP instruction.
1524
  // The first source input of the BDEP instruction. Place each mask byte into its own 8-byte lane.
1526 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1527 if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1529 } else if (lane_cnt <= 16) {
1530 ins(vtmp1, B, vtmp1, 8, 1);
1531 mov(vtmp1, B, 1, zr);
1532 } else {
1533 sve_vector_extend(vtmp1, D, vtmp1, B);
1534 }
1535
1536 // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1537 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1538 sve_dup(vtmp2, B, 1);
1539
1540 // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1541 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1542 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1543 // ---------------------------------------
1544 // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1545 sve_bdep(vtmp1, D, vtmp1, vtmp2);
1546
1547 if (bt != T_BYTE) {
1548 sve_vector_extend(vtmp1, size, vtmp1, B);
1549 }
1550 // Generate mask according to the given vector, in which the elements have been
1551 // extended to expected type.
  // dst = 0b01100101 10001101
1553 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1554 }
1555
1556 // Clobbers: rflags
1557 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1558 FloatRegister zn, FloatRegister zm, Condition cond) {
1559 assert(pg->is_governing(), "This register has to be a governing predicate register");
1560 FloatRegister z1 = zn, z2 = zm;
1561 switch (cond) {
1562 case LE: z1 = zm; z2 = zn; cond = GE; break;
1563 case LT: z1 = zm; z2 = zn; cond = GT; break;
1564 case LO: z1 = zm; z2 = zn; cond = HI; break;
1565 case LS: z1 = zm; z2 = zn; cond = HS; break;
1566 default:
1567 break;
1568 }
1569
1570 SIMD_RegVariant size = elemType_to_regVariant(bt);
1571 if (is_floating_point_type(bt)) {
1572 sve_fcm(cond, pd, size, pg, z1, z2);
1573 } else {
1574 assert(is_integral_type(bt), "unsupported element type");
1575 sve_cmp(cond, pd, size, pg, z1, z2);
1576 }
1577 }
1578
1579 // Get index of the last mask lane that is set
1580 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1581 SIMD_RegVariant size = elemType_to_regVariant(bt);
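  // Reverse the predicate so the last set lane becomes the first, count the lanes before it
  // (BRKB + CNTP), and subtract that count from lane_count - 1 to recover the original index.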
1582 sve_rev(ptmp, size, src);
1583 sve_brkb(ptmp, ptrue, ptmp, false);
1584 sve_cntp(dst, size, ptrue, ptmp);
1585 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1586 subw(dst, rscratch1, dst);
1587 }
1588
1589 // Extend integer vector src to dst with the same lane count
1590 // but larger element size, e.g. 4B -> 4I
1591 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1592 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1593 if (src_bt == T_BYTE) {
1594 // 4B to 4S/4I, 8B to 8S
1595 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1596 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1597 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1598 if (dst_bt == T_INT) {
1599 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1600 }
1601 } else if (src_bt == T_SHORT) {
1602 // 2S to 2I/2L, 4S to 4I
1603 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1604 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1605 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1606 if (dst_bt == T_LONG) {
1607 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1608 }
1609 } else if (src_bt == T_INT) {
1610 // 2I to 2L
1611 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1612 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1613 } else {
1614 ShouldNotReachHere();
1615 }
1616 }
1617
1618 // Narrow integer vector src down to dst with the same lane count
1619 // but smaller element size, e.g. 4I -> 4B
1620 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1621 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1622 if (src_bt == T_SHORT) {
1623 // 4S/8S to 4B/8B
1624 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1625 assert(dst_bt == T_BYTE, "unsupported");
1626 xtn(dst, T8B, src, T8H);
1627 } else if (src_bt == T_INT) {
1628 // 2I to 2S, 4I to 4B/4S
1629 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1630 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1631 xtn(dst, T4H, src, T4S);
1632 if (dst_bt == T_BYTE) {
1633 xtn(dst, T8B, dst, T8H);
1634 }
1635 } else if (src_bt == T_LONG) {
1636 // 2L to 2S/2I
1637 assert(src_vlen_in_bytes == 16, "unsupported");
1638 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1639 xtn(dst, T2S, src, T2D);
1640 if (dst_bt == T_SHORT) {
1641 xtn(dst, T4H, dst, T4S);
1642 }
1643 } else {
1644 ShouldNotReachHere();
1645 }
1646 }
1647
1648 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1649 FloatRegister src, SIMD_RegVariant src_size,
1650 bool is_unsigned) {
1651 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1652
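  // Each unpack-low step sign- or zero-extends the low-half lanes into elements twice as
  // wide, so B -> S chains two unpacks and B -> D chains three.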
1653 if (src_size == B) {
1654 switch (dst_size) {
1655 case H:
1656 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1657 break;
1658 case S:
1659 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1660 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1661 break;
1662 case D:
1663 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1664 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1665 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1666 break;
1667 default:
1668 ShouldNotReachHere();
1669 }
1670 } else if (src_size == H) {
1671 if (dst_size == S) {
1672 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1673 } else { // D
1674 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1675 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1676 }
1677 } else if (src_size == S) {
1678 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1679 }
1680 }
1681
1682 // Vector narrow from src to dst with specified element sizes.
1683 // High part of dst vector will be filled with zero.
1684 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1685 FloatRegister src, SIMD_RegVariant src_size,
1686 FloatRegister tmp) {
1687 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1688 assert_different_registers(src, tmp);
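  // tmp holds all-zero elements. Each uzp1 below places the even-indexed (narrowed)
  // elements of its first operand in the low half of dst and the zeros from tmp in the
  // high half, so the upper part of dst ends up cleared.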
1689 sve_dup(tmp, src_size, 0);
1690 if (src_size == D) {
1691 switch (dst_size) {
1692 case S:
1693 sve_uzp1(dst, S, src, tmp);
1694 break;
1695 case H:
1696 assert_different_registers(dst, tmp);
1697 sve_uzp1(dst, S, src, tmp);
1698 sve_uzp1(dst, H, dst, tmp);
1699 break;
1700 case B:
1701 assert_different_registers(dst, tmp);
1702 sve_uzp1(dst, S, src, tmp);
1703 sve_uzp1(dst, H, dst, tmp);
1704 sve_uzp1(dst, B, dst, tmp);
1705 break;
1706 default:
1707 ShouldNotReachHere();
1708 }
1709 } else if (src_size == S) {
1710 if (dst_size == H) {
1711 sve_uzp1(dst, H, src, tmp);
1712 } else { // B
1713 assert_different_registers(dst, tmp);
1714 sve_uzp1(dst, H, src, tmp);
1715 sve_uzp1(dst, B, dst, tmp);
1716 }
1717 } else if (src_size == H) {
1718 sve_uzp1(dst, B, src, tmp);
1719 }
1720 }
1721
// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long (a mask for a 64-bit vector of bytes
// becomes a mask for a 512-bit vector of longs, 8 lanes in both cases)
1724 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1725 uint dst_element_length_in_bytes,
1726 uint src_element_length_in_bytes) {
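  // Example (the reverse of the narrowing example documented below): extending a mask for
  // 2 ints to a mask for 2 longs on a 128-bit machine, i.e. 2I -> 2L
  //   Predicate register for the int mask TF (16 bits) : 0000 0000 0001 0000
  //   After punpklo                                     : 00000001 00000000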
1727 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1728 sve_punpklo(dst, src);
1729 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1730 sve_punpklo(dst, src);
1731 sve_punpklo(dst, dst);
1732 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1733 sve_punpklo(dst, src);
1734 sve_punpklo(dst, dst);
1735 sve_punpklo(dst, dst);
1736 } else {
1737 assert(false, "unsupported");
1738 ShouldNotReachHere();
1739 }
1740 }
1741
// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte (a mask for a 512-bit vector of
// longs becomes a mask for a 64-bit vector of bytes, 8 lanes in both cases)
1744 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1745 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in the src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to a mask for 2 ints : TF (the lower half is significant while the upper half is 0)
1754 assert_different_registers(src, ptmp);
1755 assert_different_registers(dst, ptmp);
1756 sve_pfalse(ptmp);
1757 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1758 sve_uzp1(dst, B, src, ptmp);
1759 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1760 sve_uzp1(dst, H, src, ptmp);
1761 sve_uzp1(dst, B, dst, ptmp);
1762 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1763 sve_uzp1(dst, S, src, ptmp);
1764 sve_uzp1(dst, H, dst, ptmp);
1765 sve_uzp1(dst, B, dst, ptmp);
1766 } else {
1767 assert(false, "unsupported");
1768 ShouldNotReachHere();
1769 }
1770 }
1771
1772 // Vector reduction add for integral type with ASIMD instructions.
1773 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1774 Register isrc, FloatRegister vsrc,
1775 unsigned vector_length_in_bytes,
1776 FloatRegister vtmp) {
1777 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1778 assert_different_registers(dst, isrc);
1779 bool isQ = vector_length_in_bytes == 16;
1780
1781 BLOCK_COMMENT("neon_reduce_add_integral {");
1782 switch(bt) {
1783 case T_BYTE:
1784 addv(vtmp, isQ ? T16B : T8B, vsrc);
1785 smov(dst, vtmp, B, 0);
1786 addw(dst, dst, isrc, ext::sxtb);
1787 break;
1788 case T_SHORT:
1789 addv(vtmp, isQ ? T8H : T4H, vsrc);
1790 smov(dst, vtmp, H, 0);
1791 addw(dst, dst, isrc, ext::sxth);
1792 break;
1793 case T_INT:
1794 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1795 umov(dst, vtmp, S, 0);
1796 addw(dst, dst, isrc);
1797 break;
1798 case T_LONG:
1799 assert(isQ, "unsupported");
1800 addpd(vtmp, vsrc);
1801 umov(dst, vtmp, D, 0);
1802 add(dst, dst, isrc);
1803 break;
1804 default:
1805 assert(false, "unsupported");
1806 ShouldNotReachHere();
1807 }
1808 BLOCK_COMMENT("} neon_reduce_add_integral");
1809 }
1810
1811 // Vector reduction multiply for integral type with ASIMD instructions.
1812 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1813 // Clobbers: rscratch1
1814 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1815 Register isrc, FloatRegister vsrc,
1816 unsigned vector_length_in_bytes,
1817 FloatRegister vtmp1, FloatRegister vtmp2) {
1818 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1819 bool isQ = vector_length_in_bytes == 16;
1820
1821 BLOCK_COMMENT("neon_reduce_mul_integral {");
1822 switch(bt) {
1823 case T_BYTE:
1824 if (isQ) {
        // Multiply the lower half and the higher half of the vector iteratively.
1826 // vtmp1 = vsrc[8:15]
1827 ins(vtmp1, D, vsrc, 0, 1);
1828 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1829 mulv(vtmp1, T8B, vtmp1, vsrc);
1830 // vtmp2 = vtmp1[4:7]
1831 ins(vtmp2, S, vtmp1, 0, 1);
1832 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1833 mulv(vtmp1, T8B, vtmp2, vtmp1);
1834 } else {
1835 ins(vtmp1, S, vsrc, 0, 1);
1836 mulv(vtmp1, T8B, vtmp1, vsrc);
1837 }
1838 // vtmp2 = vtmp1[2:3]
1839 ins(vtmp2, H, vtmp1, 0, 1);
1840 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1841 mulv(vtmp2, T8B, vtmp2, vtmp1);
1842 // dst = vtmp2[0] * isrc * vtmp2[1]
1843 umov(rscratch1, vtmp2, B, 0);
1844 mulw(dst, rscratch1, isrc);
1845 sxtb(dst, dst);
1846 umov(rscratch1, vtmp2, B, 1);
1847 mulw(dst, rscratch1, dst);
1848 sxtb(dst, dst);
1849 break;
1850 case T_SHORT:
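      // Same halving strategy as T_BYTE, but on 16-bit lanes: multiply the upper half into
      // the lower half, repeat until two lanes remain, then fold those lanes and isrc together.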
1851 if (isQ) {
1852 ins(vtmp2, D, vsrc, 0, 1);
1853 mulv(vtmp2, T4H, vtmp2, vsrc);
1854 ins(vtmp1, S, vtmp2, 0, 1);
1855 mulv(vtmp1, T4H, vtmp1, vtmp2);
1856 } else {
1857 ins(vtmp1, S, vsrc, 0, 1);
1858 mulv(vtmp1, T4H, vtmp1, vsrc);
1859 }
1860 umov(rscratch1, vtmp1, H, 0);
1861 mulw(dst, rscratch1, isrc);
1862 sxth(dst, dst);
1863 umov(rscratch1, vtmp1, H, 1);
1864 mulw(dst, rscratch1, dst);
1865 sxth(dst, dst);
1866 break;
1867 case T_INT:
1868 if (isQ) {
1869 ins(vtmp1, D, vsrc, 0, 1);
1870 mulv(vtmp1, T2S, vtmp1, vsrc);
1871 } else {
1872 vtmp1 = vsrc;
1873 }
1874 umov(rscratch1, vtmp1, S, 0);
1875 mul(dst, rscratch1, isrc);
1876 umov(rscratch1, vtmp1, S, 1);
1877 mul(dst, rscratch1, dst);
1878 break;
1879 case T_LONG:
1880 umov(rscratch1, vsrc, D, 0);
1881 mul(dst, isrc, rscratch1);
1882 umov(rscratch1, vsrc, D, 1);
1883 mul(dst, dst, rscratch1);
1884 break;
1885 default:
1886 assert(false, "unsupported");
1887 ShouldNotReachHere();
1888 }
1889 BLOCK_COMMENT("} neon_reduce_mul_integral");
1890 }
1891
1892 // Vector reduction multiply for floating-point type with ASIMD instructions.
1893 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1894 FloatRegister fsrc, FloatRegister vsrc,
1895 unsigned vector_length_in_bytes,
1896 FloatRegister vtmp) {
1897 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1898 bool isQ = vector_length_in_bytes == 16;
1899
1900 BLOCK_COMMENT("neon_reduce_mul_fp {");
1901 switch(bt) {
1902 case T_FLOAT:
1903 fmuls(dst, fsrc, vsrc);
1904 ins(vtmp, S, vsrc, 0, 1);
1905 fmuls(dst, dst, vtmp);
1906 if (isQ) {
1907 ins(vtmp, S, vsrc, 0, 2);
1908 fmuls(dst, dst, vtmp);
1909 ins(vtmp, S, vsrc, 0, 3);
1910 fmuls(dst, dst, vtmp);
1911 }
1912 break;
1913 case T_DOUBLE:
1914 assert(isQ, "unsupported");
1915 fmuld(dst, fsrc, vsrc);
1916 ins(vtmp, D, vsrc, 0, 1);
1917 fmuld(dst, dst, vtmp);
1918 break;
1919 default:
1920 assert(false, "unsupported");
1921 ShouldNotReachHere();
1922 }
1923 BLOCK_COMMENT("} neon_reduce_mul_fp");
1924 }
1925
1926 // Helper to select logical instruction
1927 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1928 Register Rn, Register Rm,
1929 enum shift_kind kind, unsigned shift) {
1930 switch(opc) {
1931 case Op_AndReductionV:
1932 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1933 break;
1934 case Op_OrReductionV:
1935 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1936 break;
1937 case Op_XorReductionV:
1938 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1939 break;
1940 default:
1941 assert(false, "unsupported");
1942 ShouldNotReachHere();
1943 }
1944 }
1945
1946 // Vector reduction logical operations And, Or, Xor
1947 // Clobbers: rscratch1
1948 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1949 Register isrc, FloatRegister vsrc,
1950 unsigned vector_length_in_bytes) {
1951 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1952 "unsupported");
1953 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1954 assert_different_registers(dst, isrc);
1955 bool isQ = vector_length_in_bytes == 16;
1956
1957 BLOCK_COMMENT("neon_reduce_logical {");
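  // Fold the vector into a scalar: combine the two 64-bit halves (32-bit halves for an 8-byte
  // vector) with the logical op, keep folding the upper half onto the lower half using the
  // shifted-register forms, then apply the op once more with isrc and sign-extend sub-word results.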
1958 umov(rscratch1, vsrc, isQ ? D : S, 0);
1959 umov(dst, vsrc, isQ ? D : S, 1);
1960 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1961 switch(bt) {
1962 case T_BYTE:
1963 if (isQ) {
1964 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1965 }
1966 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1967 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1968 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1969 sxtb(dst, dst);
1970 break;
1971 case T_SHORT:
1972 if (isQ) {
1973 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1974 }
1975 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1976 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1977 sxth(dst, dst);
1978 break;
1979 case T_INT:
1980 if (isQ) {
1981 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1982 }
1983 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1984 break;
1985 case T_LONG:
1986 assert(isQ, "unsupported");
1987 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1988 break;
1989 default:
1990 assert(false, "unsupported");
1991 ShouldNotReachHere();
1992 }
1993 BLOCK_COMMENT("} neon_reduce_logical");
1994 }
1995
1996 // Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1998 // Clobbers: rscratch1, rflags
1999 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2000 Register isrc, FloatRegister vsrc,
2001 unsigned vector_length_in_bytes,
2002 FloatRegister vtmp) {
2003 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2004 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2005 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2006 assert_different_registers(dst, isrc);
2007 bool isQ = vector_length_in_bytes == 16;
2008 bool is_min = opc == Op_MinReductionV;
2009
2010 BLOCK_COMMENT("neon_reduce_minmax_integral {");
2011 if (bt == T_LONG) {
2012 assert(vtmp == fnoreg, "should be");
2013 assert(isQ, "should be");
2014 umov(rscratch1, vsrc, D, 0);
2015 cmp(isrc, rscratch1);
2016 csel(dst, isrc, rscratch1, is_min ? LT : GT);
2017 umov(rscratch1, vsrc, D, 1);
2018 cmp(dst, rscratch1);
2019 csel(dst, dst, rscratch1, is_min ? LT : GT);
2020 } else {
2021 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2022 if (size == T2S) {
2023 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2024 } else {
2025 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2026 }
2027 if (bt == T_INT) {
2028 umov(dst, vtmp, S, 0);
2029 } else {
2030 smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2031 }
2032 cmpw(dst, isrc);
2033 cselw(dst, dst, isrc, is_min ? LT : GT);
2034 }
2035 BLOCK_COMMENT("} neon_reduce_minmax_integral");
2036 }
2037
2038 // Vector reduction for integral type with SVE instruction.
2039 // Supported operations are Add, And, Or, Xor, Max, Min.
2040 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2041 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2042 FloatRegister src2, PRegister pg, FloatRegister tmp) {
2043 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2044 assert(pg->is_governing(), "This register has to be a governing predicate register");
2045 assert_different_registers(src1, dst);
2046 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2047 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2048 switch (opc) {
2049 case Op_AddReductionVI: {
2050 sve_uaddv(tmp, size, pg, src2);
2051 if (bt == T_BYTE) {
2052 smov(dst, tmp, size, 0);
2053 addw(dst, src1, dst, ext::sxtb);
2054 } else if (bt == T_SHORT) {
2055 smov(dst, tmp, size, 0);
2056 addw(dst, src1, dst, ext::sxth);
2057 } else {
2058 umov(dst, tmp, size, 0);
2059 addw(dst, dst, src1);
2060 }
2061 break;
2062 }
2063 case Op_AddReductionVL: {
2064 sve_uaddv(tmp, size, pg, src2);
2065 umov(dst, tmp, size, 0);
2066 add(dst, dst, src1);
2067 break;
2068 }
2069 case Op_AndReductionV: {
2070 sve_andv(tmp, size, pg, src2);
2071 if (bt == T_INT || bt == T_LONG) {
2072 umov(dst, tmp, size, 0);
2073 } else {
2074 smov(dst, tmp, size, 0);
2075 }
2076 if (bt == T_LONG) {
2077 andr(dst, dst, src1);
2078 } else {
2079 andw(dst, dst, src1);
2080 }
2081 break;
2082 }
2083 case Op_OrReductionV: {
2084 sve_orv(tmp, size, pg, src2);
2085 if (bt == T_INT || bt == T_LONG) {
2086 umov(dst, tmp, size, 0);
2087 } else {
2088 smov(dst, tmp, size, 0);
2089 }
2090 if (bt == T_LONG) {
2091 orr(dst, dst, src1);
2092 } else {
2093 orrw(dst, dst, src1);
2094 }
2095 break;
2096 }
2097 case Op_XorReductionV: {
2098 sve_eorv(tmp, size, pg, src2);
2099 if (bt == T_INT || bt == T_LONG) {
2100 umov(dst, tmp, size, 0);
2101 } else {
2102 smov(dst, tmp, size, 0);
2103 }
2104 if (bt == T_LONG) {
2105 eor(dst, dst, src1);
2106 } else {
2107 eorw(dst, dst, src1);
2108 }
2109 break;
2110 }
2111 case Op_MaxReductionV: {
2112 sve_smaxv(tmp, size, pg, src2);
2113 if (bt == T_INT || bt == T_LONG) {
2114 umov(dst, tmp, size, 0);
2115 } else {
2116 smov(dst, tmp, size, 0);
2117 }
2118 if (bt == T_LONG) {
2119 cmp(dst, src1);
2120 csel(dst, dst, src1, Assembler::GT);
2121 } else {
2122 cmpw(dst, src1);
2123 cselw(dst, dst, src1, Assembler::GT);
2124 }
2125 break;
2126 }
2127 case Op_MinReductionV: {
2128 sve_sminv(tmp, size, pg, src2);
2129 if (bt == T_INT || bt == T_LONG) {
2130 umov(dst, tmp, size, 0);
2131 } else {
2132 smov(dst, tmp, size, 0);
2133 }
2134 if (bt == T_LONG) {
2135 cmp(dst, src1);
2136 csel(dst, dst, src1, Assembler::LT);
2137 } else {
2138 cmpw(dst, src1);
2139 cselw(dst, dst, src1, Assembler::LT);
2140 }
2141 break;
2142 }
2143 default:
2144 assert(false, "unsupported");
2145 ShouldNotReachHere();
2146 }
2147
2148 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
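    // The logical reductions above work on sign-extended lanes, but src1 may not arrive as a
    // properly sign-extended byte/short, so normalize the final result for sub-word types.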
2149 if (bt == T_BYTE) {
2150 sxtb(dst, dst);
2151 } else if (bt == T_SHORT) {
2152 sxth(dst, dst);
2153 }
2154 }
2155 }
2156
2157 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2158 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2159 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2160 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2161 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2162 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2163
2164 // Set all elements to false if the input "lane_cnt" is zero.
2165 if (lane_cnt == 0) {
2166 sve_pfalse(dst);
2167 return;
2168 }
2169
2170 SIMD_RegVariant size = elemType_to_regVariant(bt);
2171 assert(size != Q, "invalid size");
2172
  // Set all elements to true if "lane_cnt" equals the max lane count.
2174 if (lane_cnt == max_vector_length) {
2175 sve_ptrue(dst, size, /* ALL */ 0b11111);
2176 return;
2177 }
2178
2179 // Fixed numbers for "ptrue".
2180 switch(lane_cnt) {
2181 case 1: /* VL1 */
2182 case 2: /* VL2 */
2183 case 3: /* VL3 */
2184 case 4: /* VL4 */
2185 case 5: /* VL5 */
2186 case 6: /* VL6 */
2187 case 7: /* VL7 */
2188 case 8: /* VL8 */
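      // The SVE ptrue patterns VL1-VL8 are encoded as the values 1-8, so the lane count can
      // be passed through as the pattern directly.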
2189 sve_ptrue(dst, size, lane_cnt);
2190 return;
2191 case 16:
2192 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2193 return;
2194 case 32:
2195 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2196 return;
2197 case 64:
2198 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2199 return;
2200 case 128:
2201 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2202 return;
2203 case 256:
2204 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2205 return;
2206 default:
2207 break;
2208 }
2209
2210 // Special patterns for "ptrue".
2211 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2212 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2213 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2214 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2215 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2216 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2217 } else {
2218 // Encode to "whileltw" for the remaining cases.
2219 mov(rscratch1, lane_cnt);
2220 sve_whileltw(dst, size, zr, rscratch1);
2221 }
2222 }
2223
2224 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2225 // Any remaining elements of dst will be filled with zero.
2226 // Clobbers: rscratch1
2227 // Preserves: mask, vzr
2228 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2229 FloatRegister vzr, FloatRegister vtmp,
2230 PRegister pgtmp, unsigned vector_length_in_bytes) {
2231 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2232 // When called by sve_compress_byte, src and vtmp may be the same register.
2233 assert_different_registers(dst, src, vzr);
2234 assert_different_registers(dst, vtmp, vzr);
2235 assert_different_registers(mask, pgtmp);
2236 // high <-- low
2237 // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
2238 // mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
2239 // Expected result: dst = 00 00 00 hh ee dd bb aa
2240
2241 // Extend lowest half to type INT.
2242 // dst = 00dd 00cc 00bb 00aa
2243 sve_uunpklo(dst, S, src);
2244 // pgtmp = 0001 0000 0001 0001
2245 sve_punpklo(pgtmp, mask);
  // Pack the active elements, viewed as INT-sized lanes, to the right,
  // and fill the remaining elements with zero.
2248 // dst = 0000 00dd 00bb 00aa
2249 sve_compact(dst, S, dst, pgtmp);
2250 // Narrow the result back to type SHORT.
2251 // dst = 00 00 00 00 00 dd bb aa
2252 sve_uzp1(dst, H, dst, vzr);
2253
2254 // Return if the vector length is no more than MaxVectorSize/2, since the
2255 // highest half is invalid.
2256 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2257 return;
2258 }
2259
  // Count the active elements of the lowest half.
2261 // rscratch1 = 3
2262 sve_cntp(rscratch1, S, ptrue, pgtmp);
2263
2264 // Repeat to the highest half.
2265 // pgtmp = 0001 0000 0000 0001
2266 sve_punpkhi(pgtmp, mask);
2267 // vtmp = 00hh 00gg 00ff 00ee
2268 sve_uunpkhi(vtmp, S, src);
2269 // vtmp = 0000 0000 00hh 00ee
2270 sve_compact(vtmp, S, vtmp, pgtmp);
2271 // vtmp = 00 00 00 00 00 00 hh ee
2272 sve_uzp1(vtmp, H, vtmp, vzr);
2273
2274 // pgtmp = 00 00 00 00 00 01 01 01
2275 sve_whilelt(pgtmp, H, zr, rscratch1);
2276 // Compressed low: dst = 00 00 00 00 00 dd bb aa
2277 // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2278 // Combine the compressed low with the compressed high:
2279 // dst = 00 00 00 hh ee dd bb aa
2280 sve_splice(dst, H, pgtmp, vtmp);
2281 }
2282
2283 // Clobbers: rscratch1, rscratch2
2284 // Preserves: src, mask
2285 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2286 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2287 PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2288 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2289 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2290 assert_different_registers(mask, ptmp, pgtmp);
2291 // high <-- low
2292 // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
2293 // mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2294 // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2295 FloatRegister vzr = vtmp3;
2296 sve_dup(vzr, B, 0);
2297
2298 // Extend lowest half to type SHORT.
2299 // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
2300 sve_uunpklo(vtmp1, H, src);
2301 // ptmp = 00 01 00 00 00 01 00 01
2302 sve_punpklo(ptmp, mask);
  // Pack the active elements, viewed as SHORT-sized lanes, to the right,
  // and fill the remaining elements with zero.
2305 // dst = 00 00 00 00 00 0g 0c 0a
2306 unsigned extended_size = vector_length_in_bytes << 1;
2307 sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2308 // Narrow the result back to type BYTE.
2309 // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2310 sve_uzp1(dst, B, dst, vzr);
2311
2312 // Return if the vector length is no more than MaxVectorSize/2, since the
2313 // highest half is invalid.
2314 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2315 return;
2316 }
  // Count the active elements of the lowest half.
2318 // rscratch2 = 3
2319 sve_cntp(rscratch2, H, ptrue, ptmp);
2320
2321 // Repeat to the highest half.
2322 // ptmp = 00 01 00 00 00 00 00 01
2323 sve_punpkhi(ptmp, mask);
2324 // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
2325 sve_uunpkhi(vtmp2, H, src);
2326 // vtmp1 = 00 00 00 00 00 00 0p 0i
2327 sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2328 // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2329 sve_uzp1(vtmp1, B, vtmp1, vzr);
2330
2331 // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2332 sve_whilelt(ptmp, B, zr, rscratch2);
2333 // Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2334 // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2335 // Combine the compressed low with the compressed high:
2336 // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2337 sve_splice(dst, B, ptmp, vtmp1);
2338 }
2339
2340 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2341 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2342 SIMD_Arrangement size = isQ ? T16B : T8B;
2343 if (bt == T_BYTE) {
2344 rbit(dst, size, src);
2345 } else {
2346 neon_reverse_bytes(dst, src, bt, isQ);
2347 rbit(dst, size, dst);
2348 }
2349 }
2350
2351 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2352 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2353 SIMD_Arrangement size = isQ ? T16B : T8B;
2354 switch (bt) {
2355 case T_BYTE:
2356 if (dst != src) {
2357 orr(dst, size, src, src);
2358 }
2359 break;
2360 case T_SHORT:
2361 rev16(dst, size, src);
2362 break;
2363 case T_INT:
2364 rev32(dst, size, src);
2365 break;
2366 case T_LONG:
2367 rev64(dst, size, src);
2368 break;
2369 default:
2370 assert(false, "unsupported");
2371 ShouldNotReachHere();
2372 }
2373 }
2374
2375 // VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
// But since it only supports byte tables, we need to look up 2/4 bytes as a group.
2378 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2379 // and use bsl to implement the operation.
2380 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2381 FloatRegister shuffle, FloatRegister tmp,
2382 BasicType bt, bool isQ) {
2383 assert_different_registers(dst, src, shuffle, tmp);
2384 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2385 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2386
2387 // Here is an example that rearranges a NEON vector with 4 ints:
2388 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2389 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2390 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2391 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2392 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2393 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2394 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2395 // 4. Use Vm as index register, and use V1 as table register.
2396 // Then get V2 as the result by tbl NEON instructions.
2397 switch (bt) {
2398 case T_SHORT:
2399 mov(tmp, size1, 0x02);
2400 mulv(dst, size2, shuffle, tmp);
2401 mov(tmp, size2, 0x0100);
2402 addv(dst, size1, dst, tmp);
2403 tbl(dst, size1, src, 1, dst);
2404 break;
2405 case T_INT:
2406 case T_FLOAT:
2407 mov(tmp, size1, 0x04);
2408 mulv(dst, size2, shuffle, tmp);
2409 mov(tmp, size2, 0x03020100);
2410 addv(dst, size1, dst, tmp);
2411 tbl(dst, size1, src, 1, dst);
2412 break;
2413 case T_LONG:
2414 case T_DOUBLE:
2415 // Load the iota indices for Long type. The indices are ordered by
      // type B/S/I/L/F/D, and the offset between two types is 16; hence
2417 // the offset for L is 48.
2418 lea(rscratch1,
2419 ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
2420 ldrq(tmp, rscratch1);
      // Check whether the input "shuffle" is the same as the iota indices.
2422 // Return "src" if true, otherwise swap the two elements of "src".
2423 cm(EQ, dst, size2, shuffle, tmp);
2424 ext(tmp, size1, src, src, 8);
2425 bsl(dst, size1, src, tmp);
2426 break;
2427 default:
2428 assert(false, "unsupported element type");
2429 ShouldNotReachHere();
2430 }
2431 }
2432
2433 // Extract a scalar element from an sve vector at position 'idx'.
2434 // The input elements in src are expected to be of integral type.
2435 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2436 int idx, FloatRegister vtmp) {
2437 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2438 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2439 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2440 if (bt == T_INT || bt == T_LONG) {
2441 umov(dst, src, size, idx);
2442 } else {
2443 smov(dst, src, size, idx);
2444 }
2445 } else {
2446 sve_orr(vtmp, src, src);
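    // Shift the wanted element down to lane 0. The ext immediate is a byte offset and
    // SIMD_RegVariant encodes B/H/S/D as 0/1/2/3, so "idx << size" is idx * element size in bytes.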
2447 sve_ext(vtmp, vtmp, idx << size);
2448 if (bt == T_INT || bt == T_LONG) {
2449 umov(dst, vtmp, size, 0);
2450 } else {
2451 smov(dst, vtmp, size, 0);
2452 }
2453 }
2454 }
2455
2456 // java.lang.Math::round intrinsics
2457
2458 // Clobbers: rscratch1, rflags
2459 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2460 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2461 assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2462 switch (T) {
2463 case T2S:
2464 case T4S:
2465 fmovs(tmp1, T, 0.5f);
2466 mov(rscratch1, jint_cast(0x1.0p23f));
2467 break;
2468 case T2D:
2469 fmovd(tmp1, T, 0.5);
2470 mov(rscratch1, julong_cast(0x1.0p52));
2471 break;
2472 default:
2473 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2474 }
2475 fadd(tmp1, T, tmp1, src);
2476 fcvtms(tmp1, T, tmp1);
2477 // tmp1 = floor(src + 0.5, ties to even)
2478
2479 fcvtas(dst, T, src);
2480 // dst = round(src), ties to away
2481
2482 fneg(tmp3, T, src);
2483 dup(tmp2, T, rscratch1);
2484 cm(HS, tmp3, T, tmp3, tmp2);
2485 // tmp3 is now a set of flags
2486
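  // Per-lane select: lanes where tmp3 is all-ones keep the fcvtas (ties-away) result already
  // in dst; lanes where tmp3 is zero take tmp1, i.e. floor(src + 0.5).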
2487 bif(dst, T16B, tmp1, tmp3);
2488 // result in dst
2489 }
2490
2491 // Clobbers: rscratch1, rflags
2492 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2493 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2494 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2495 assert_different_registers(tmp1, tmp2, src, dst);
2496
2497 switch (T) {
2498 case S:
2499 mov(rscratch1, jint_cast(0x1.0p23f));
2500 break;
2501 case D:
2502 mov(rscratch1, julong_cast(0x1.0p52));
2503 break;
2504 default:
2505 assert(T == S || T == D, "invalid register variant");
2506 }
2507
2508 sve_frinta(dst, T, ptrue, src);
2509 // dst = round(src), ties to away
2510
2511 Label none;
2512
2513 sve_fneg(tmp1, T, ptrue, src);
2514 sve_dup(tmp2, T, rscratch1);
2515 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2516 br(EQ, none);
2517 {
2518 sve_cpy(tmp1, T, pgtmp, 0.5);
2519 sve_fadd(tmp1, T, pgtmp, src);
2520 sve_frintm(dst, T, pgtmp, tmp1);
2521 // dst = floor(src + 0.5, ties to even)
2522 }
2523 bind(none);
2524
2525 sve_fcvtzs(dst, T, ptrue, dst, T);
2526 // result in dst
2527 }
2528
2529 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2530 FloatRegister one, SIMD_Arrangement T) {
2531 assert_different_registers(dst, src, zero, one);
2532 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2533
2534 facgt(dst, T, src, zero);
2535 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
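  // bsl: where dst is 0x7FF..F, merge the magnitude bits of "one" with the sign bit of src to
  // give +-1.0; where dst is 0, pass src (+-0.0 or NaN) through unchanged.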
2536 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2537 }
2538
2539 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2540 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2541 assert_different_registers(dst, src, zero, one, vtmp);
2542 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2543
2544 sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2546 switch (T) {
2547 case S:
2548 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2549 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2550 // on the sign of the float value
2551 break;
2552 case D:
2553 sve_and(vtmp, T, min_jlong);
2554 sve_orr(vtmp, T, jlong_cast(1.0));
2555 break;
2556 default:
2557 assert(false, "unsupported");
2558 ShouldNotReachHere();
2559 }
2560 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2561 // Result in dst
2562 }
2563
2564 bool C2_MacroAssembler::in_scratch_emit_size() {
2565 if (ciEnv::current()->task() != nullptr) {
2566 PhaseOutput* phase_output = Compile::current()->output();
2567 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2568 return true;
2569 }
2570 }
2571 return MacroAssembler::in_scratch_emit_size();
2572 }
2573
2574 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2575 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2576 }
2577
2578 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2579 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2580 if (t == TypeInt::INT) {
2581 return;
2582 }
2583 BLOCK_COMMENT("verify_int_in_range {");
2584 Label L_success, L_failure;
2585
2586 jint lo = t->_lo;
2587 jint hi = t->_hi;
2588
2589 if (lo != min_jint && hi != max_jint) {
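    // Both bounds are checked: branch to the failure path if rval < lo, succeed if rval <= hi,
    // and fall through to L_failure below when rval > hi.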
2590 subsw(rtmp, rval, lo);
2591 br(Assembler::LT, L_failure);
2592 subsw(rtmp, rval, hi);
2593 br(Assembler::LE, L_success);
2594 } else if (lo != min_jint) {
2595 subsw(rtmp, rval, lo);
2596 br(Assembler::GE, L_success);
2597 } else if (hi != max_jint) {
2598 subsw(rtmp, rval, hi);
2599 br(Assembler::LE, L_success);
2600 } else {
2601 ShouldNotReachHere();
2602 }
2603
2604 bind(L_failure);
2605 movw(c_rarg0, idx);
2606 mov(c_rarg1, rval);
2607 movw(c_rarg2, lo);
2608 movw(c_rarg3, hi);
2609 reconstruct_frame_pointer(rtmp);
2610 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2611 hlt(0);
2612
2613 bind(L_success);
2614 BLOCK_COMMENT("} verify_int_in_range");
2615 }
2616
2617 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2618 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2619 }
2620
2621 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2622 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2623 if (t == TypeLong::LONG) {
2624 return;
2625 }
2626 BLOCK_COMMENT("verify_long_in_range {");
2627 Label L_success, L_failure;
2628
2629 jlong lo = t->_lo;
2630 jlong hi = t->_hi;
2631
2632 if (lo != min_jlong && hi != max_jlong) {
2633 subs(rtmp, rval, lo);
2634 br(Assembler::LT, L_failure);
2635 subs(rtmp, rval, hi);
2636 br(Assembler::LE, L_success);
2637 } else if (lo != min_jlong) {
2638 subs(rtmp, rval, lo);
2639 br(Assembler::GE, L_success);
2640 } else if (hi != max_jlong) {
2641 subs(rtmp, rval, hi);
2642 br(Assembler::LE, L_success);
2643 } else {
2644 ShouldNotReachHere();
2645 }
2646
2647 bind(L_failure);
2648 movw(c_rarg0, idx);
2649 mov(c_rarg1, rval);
2650 mov(c_rarg2, lo);
2651 mov(c_rarg3, hi);
2652 reconstruct_frame_pointer(rtmp);
2653 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2654 hlt(0);
2655
2656 bind(L_success);
2657 BLOCK_COMMENT("} verify_long_in_range");
2658 }
2659
2660 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2661 const int framesize = Compile::current()->output()->frame_size_in_bytes();
2662 if (PreserveFramePointer) {
2663 // frame pointer is valid
2664 #ifdef ASSERT
2665 // Verify frame pointer value in rfp.
2666 add(rtmp, sp, framesize - 2 * wordSize);
2667 Label L_success;
2668 cmp(rfp, rtmp);
2669 br(Assembler::EQ, L_success);
2670 stop("frame pointer mismatch");
2671 bind(L_success);
2672 #endif // ASSERT
2673 } else {
2674 add(rfp, sp, framesize - 2 * wordSize);
2675 }
2676 }
2677
// Selects elements from two source vectors (src1, src2) based on the index values in the index
// register using Neon instructions, and places them in the corresponding elements of the
// destination vector. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2684 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2685 FloatRegister src2, FloatRegister index,
2686 FloatRegister tmp, unsigned vector_length_in_bytes) {
2687 assert_different_registers(dst, src1, src2, tmp);
2688 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2689
2690 if (vector_length_in_bytes == 16) {
2691 assert(UseSVE <= 1, "sve must be <= 1");
2692 assert(src1->successor() == src2, "Source registers must be ordered");
2693 // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2694 tbl(dst, size, src1, 2, index);
2695 } else { // vector length == 8
2696 assert(UseSVE == 0, "must be Neon only");
2697 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2698 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2699 // instruction with one vector lookup
2700 ins(tmp, D, src1, 0, 0);
2701 ins(tmp, D, src2, 1, 0);
2702 tbl(dst, size, tmp, 1, index);
2703 }
2704 }
2705
// Selects elements from two source vectors (src1, src2) based on the index values in the index
// register using SVE/SVE2 instructions, and places them in the corresponding elements of the
// destination vector. Each index in the index register must be in the range [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2712 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2713 FloatRegister src2, FloatRegister index,
2714 FloatRegister tmp, SIMD_RegVariant T,
2715 unsigned vector_length_in_bytes) {
2716 assert_different_registers(dst, src1, src2, index, tmp);
2717
2718 if (vector_length_in_bytes == 8) {
2719 // We need to fit both the source vectors (src1, src2) in a single vector register because the
2720 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2721 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2722 // instruction with one vector lookup
2723 assert(UseSVE >= 1, "sve must be >= 1");
2724 ins(tmp, D, src1, 0, 0);
2725 ins(tmp, D, src2, 1, 0);
2726 sve_tbl(dst, T, tmp, index);
2727 } else { // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with a two-vector table.
    // The assertion vector_length_in_bytes == MaxVectorSize ensures that this path is not taken
    // on machines where vector_length_in_bytes < MaxVectorSize, with the only exception of the
    // 8B vector length handled above.
2732 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2733 assert(src1->successor() == src2, "Source registers must be ordered");
2734 sve_tbl(dst, T, src1, src2, index);
2735 }
2736 }
2737
2738 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2739 FloatRegister src2, FloatRegister index,
2740 FloatRegister tmp, BasicType bt,
2741 unsigned vector_length_in_bytes) {
2742
2743 assert_different_registers(dst, src1, src2, index, tmp);
2744
2745 // The cases that can reach this method are -
2746 // - UseSVE = 0, vector_length_in_bytes = 8 or 16
2747 // - UseSVE = 1, vector_length_in_bytes = 8 or 16
2748 // - UseSVE = 2, vector_length_in_bytes >= 8
2749 //
2750 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2751 // and UseSVE = 2 with vector_length_in_bytes >= 8
2752 //
2753 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2754 // UseSVE = 1 with vector_length_in_bytes = 16
2755
2756 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2757 SIMD_RegVariant T = elemType_to_regVariant(bt);
2758 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2759 return;
2760 }
2761
2762 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2763 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2764 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2765
2766 bool isQ = vector_length_in_bytes == 16;
2767
2768 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2769 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2770
2771 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2772 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
  // The index values in the "index" register are in the range [0, 2 * NUM_ELEM), where NUM_ELEM
  // is the number of elements that fit in a vector. For example, for T_SHORT with a 64-bit vector
  // length, the indices range over [0, 8).
2776 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2777 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2778 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2779 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2780 // Add the multiplied result to the vector in tmp to obtain the byte level
2781 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2782 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2783
2784 if (bt == T_BYTE) {
2785 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2786 } else {
2787 int elem_size = (bt == T_SHORT) ? 2 : 4;
2788 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2789
2790 mov(tmp, size1, elem_size);
2791 mulv(dst, size2, index, tmp);
2792 mov(tmp, size2, tbl_offset);
2793 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2794 // to select a set of 2B/4B
2795 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2796 }
2797 }
2798
2799 // Vector expand implementation. Elements from the src vector are expanded into
2800 // the dst vector under the control of the vector mask.
2801 // Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
2803 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2804 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2805 // for NEON and SVE, but with different instructions where appropriate.
2806
2807 // Vector expand implementation for NEON.
2808 //
2809 // An example of 128-bit Byte vector:
2810 // Data direction: high <== low
2811 // Input:
2812 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2813 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2814 // Expected result:
2815 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2816 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2817 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2818 int vector_length_in_bytes) {
2819 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2820 assert_different_registers(dst, src, mask, tmp1, tmp2);
2821 // Since the TBL instruction only supports byte table, we need to
2822 // compute indices in byte type for all types.
2823 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2824 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2825 dup(tmp1, size, zr);
2826 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
2827 negr(dst, size, mask);
2828 // Calculate vector index for TBL with prefix sum algorithm.
2829 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
2830 for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2831 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2832 addv(dst, size, tmp2, dst);
2833 }
2834 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2835 orr(tmp2, size, mask, mask);
2836 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2837 bsl(tmp2, size, dst, tmp1);
2838 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2839 movi(tmp1, size, 1);
2840 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
2841 subv(dst, size, tmp2, tmp1);
2842 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2843 tbl(dst, size, src, 1, dst);
2844 }
2845
2846 // Vector expand implementation for SVE.
2847 //
2848 // An example of 128-bit Short vector:
2849 // Data direction: high <== low
2850 // Input:
2851 // src = gf ed cb a9 87 65 43 21
2852 // pg = 00 01 00 01 00 01 00 01
2853 // Expected result:
2854 // dst = 00 87 00 65 00 43 00 21
2855 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2856 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2857 int vector_length_in_bytes) {
2858 assert(UseSVE > 0, "expand implementation only for SVE");
2859 assert_different_registers(dst, src, tmp1, tmp2);
2860 SIMD_RegVariant size = elemType_to_regVariant(bt);
2861
2862 // tmp1 = 00 00 00 00 00 00 00 00
2863 sve_dup(tmp1, size, 0);
2864 sve_movprfx(tmp2, tmp1);
2865 // tmp2 = 00 01 00 01 00 01 00 01
2866 sve_cpy(tmp2, size, pg, 1, true);
2867 // Calculate vector index for TBL with prefix sum algorithm.
2868 // tmp2 = 04 04 03 03 02 02 01 01
2869 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2870 sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width SVE register. The correct
    // index is therefore:
    // vector_length_in_bytes - i + (MaxVectorSize - vector_length_in_bytes) =>
    // MaxVectorSize - i.
2875 sve_ext(dst, tmp2, MaxVectorSize - i);
2876 sve_add(tmp2, size, dst, tmp2);
2877 }
2878 // dst = 00 04 00 03 00 02 00 01
2879 sve_sel(dst, size, pg, tmp2, tmp1);
2880 // dst = -1 03 -1 02 -1 01 -1 00
2881 sve_sub(dst, size, 1);
2882 // dst = 00 87 00 65 00 43 00 21
2883 sve_tbl(dst, size, src, dst);
2884 }