1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright 2026 Arm Limited and/or its affiliates.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "opto/c2_MacroAssembler.hpp"
29 #include "opto/compile.hpp"
30 #include "opto/intrinsicnode.hpp"
31 #include "opto/matcher.hpp"
32 #include "opto/output.hpp"
33 #include "opto/subnode.hpp"
34 #include "runtime/objectMonitorTable.hpp"
35 #include "runtime/stubRoutines.hpp"
36 #include "runtime/synchronizer.hpp"
37 #include "utilities/globalDefinitions.hpp"
38 #include "utilities/powerOfTwo.hpp"
39
40 #ifdef PRODUCT
41 #define BLOCK_COMMENT(str) /* nothing */
42 #define STOP(error) stop(error)
43 #else
44 #define BLOCK_COMMENT(str) block_comment(str)
45 #define STOP(error) block_comment(error); stop(error)
46 #endif
47
48 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
49
50 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
51
52 // jdk.internal.util.ArraysSupport.vectorizedHashCode
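// Computes, for the cnt elements of ary and the incoming value in result, the
// standard Java polynomial hash, i.e. the scalar loop
//   for (i = 0; i < cnt; i++) result = 31 * result + ary[i];
// vectorized for large inputs.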
53 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
54 FloatRegister vdata0, FloatRegister vdata1,
55 FloatRegister vdata2, FloatRegister vdata3,
56 FloatRegister vmul0, FloatRegister vmul1,
57 FloatRegister vmul2, FloatRegister vmul3,
58 FloatRegister vpow, FloatRegister vpowm,
59 BasicType eltype) {
60 ARRAYS_HASHCODE_REGISTERS;
61
62 Register tmp1 = rscratch1, tmp2 = rscratch2;
63
64 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
65
  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs.
  // We use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible
  // to use 4H for chars and shorts instead, but using 8H gives better performance.
69 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
70 : eltype == T_CHAR || eltype == T_SHORT ? 8
71 : eltype == T_INT ? 4
72 : 0;
73 guarantee(vf, "unsupported eltype");
74
75 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
76 const size_t unroll_factor = 4;
77
78 switch (eltype) {
79 case T_BOOLEAN:
80 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
81 break;
82 case T_CHAR:
83 BLOCK_COMMENT("arrays_hashcode(char) {");
84 break;
85 case T_BYTE:
86 BLOCK_COMMENT("arrays_hashcode(byte) {");
87 break;
88 case T_SHORT:
89 BLOCK_COMMENT("arrays_hashcode(short) {");
90 break;
91 case T_INT:
92 BLOCK_COMMENT("arrays_hashcode(int) {");
93 break;
94 default:
95 ShouldNotReachHere();
96 }
97
98 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
99 // implemented by the stub executes just once. Call the stub only if at least two iterations will
100 // be executed.
101 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
102 cmpw(cnt, large_threshold);
103 br(Assembler::HS, LARGE);
104
105 bind(TAIL);
106
  // The andr computes cnt % uf, where uf = unroll_factor. The subtract, shifted left by 3, moves
  // the branch target past uf - (cnt % uf) pairs of load + madd insns, i.e. only cnt % uf
  // load + madd pairs are executed before reaching BR_BASE. The loop below then consumes the
  // remaining elements, uf at a time.
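  // For example, with unroll_factor == 4, cnt % 4 == 1 and no Cortex-A53 nops, tmp1 becomes
  // BR_BASE - 1 * 8, so execution enters at the last load + madd pair and hashes exactly one
  // element before falling through to BR_BASE.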
110 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
111 andr(tmp2, cnt, unroll_factor - 1);
112 adr(tmp1, BR_BASE);
  // For Cortex-A53 the shift is 4 because the 2 extra nops make each load + madd pair 4 instructions long.
114 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
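  // 0x1f == 31, the hash multiplier: each maddw below computes result = 31 * result + element.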
115 movw(tmp2, 0x1f);
116 br(tmp1);
117
118 bind(LOOP);
119 for (size_t i = 0; i < unroll_factor; ++i) {
120 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
121 maddw(result, result, tmp2, tmp1);
122 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
123 // Generate 2nd nop to have 4 instructions per iteration.
124 if (VM_Version::supports_a53mac()) {
125 nop();
126 }
127 }
128 bind(BR_BASE);
129 subsw(cnt, cnt, unroll_factor);
130 br(Assembler::HS, LOOP);
131
132 b(DONE);
133
134 bind(LARGE);
135
136 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
137 assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
138 address tpc = trampoline_call(stub);
139 if (tpc == nullptr) {
140 DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
141 postcond(pc() == badAddress);
142 return nullptr;
143 }
144
145 bind(DONE);
146
147 BLOCK_COMMENT("} // arrays_hashcode");
148
149 postcond(pc() != badAddress);
150 return pc();
151 }
152
153 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
154 Register t2, Register t3) {
155 assert_different_registers(obj, box, t1, t2, t3, rscratch2);
156
157 // Handle inflated monitor.
158 Label inflated;
  // Finish fast lock successfully. MUST be reached with flag == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST be reached with flag == NE.
162 Label slow_path;
163
164 if (UseObjectMonitorTable) {
165 // Clear cache in case fast locking succeeds or we need to take the slow-path.
166 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
167 }
168
169 if (DiagnoseSyncOnValueBasedClasses != 0) {
170 load_klass(t1, obj);
171 ldrb(t1, Address(t1, Klass::misc_flags_offset()));
172 tst(t1, KlassFlags::_misc_is_value_based_class);
173 br(Assembler::NE, slow_path);
174 }
175
176 const Register t1_mark = t1;
177 const Register t3_t = t3;
178
179 { // Fast locking
180
    // Push lock to the lock stack and finish successfully. MUST be reached with flag == EQ.
182 Label push;
183
184 const Register t2_top = t2;
185
186 // Check if lock-stack is full.
187 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
188 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
189 br(Assembler::GT, slow_path);
190
191 // Check if recursive.
192 subw(t3_t, t2_top, oopSize);
193 ldr(t3_t, Address(rthread, t3_t));
194 cmp(obj, t3_t);
195 br(Assembler::EQ, push);
196
197 // Relaxed normal load to check for monitor. Optimization for monitor case.
198 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
199 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
200
201 // Not inflated
202 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
203
204 // Try to lock. Transition lock-bits 0b01 => 0b00
205 orr(t1_mark, t1_mark, markWord::unlocked_value);
206 eor(t3_t, t1_mark, markWord::unlocked_value);
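    // t1_mark (expected) now has lock bits 0b01 (unlocked) and t3_t (new) is the same
    // mark word with lock bits 0b00 (fast-locked), so the cmpxchg below attempts the
    // 0b01 => 0b00 transition atomically.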
207 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
208 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
209 br(Assembler::NE, slow_path);
210
211 bind(push);
212 // After successful lock, push object on lock-stack.
213 str(obj, Address(rthread, t2_top));
214 addw(t2_top, t2_top, oopSize);
215 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
216 b(locked);
217 }
218
219 { // Handle inflated monitor.
220 bind(inflated);
221
222 const Register t1_monitor = t1;
223
224 if (!UseObjectMonitorTable) {
225 assert(t1_monitor == t1_mark, "should be the same here");
226 } else {
227 const Register t1_hash = t1;
228 Label monitor_found;
229
      // Save the mark; we might need it to extract the hash.
231 mov(t3, t1_mark);
232
233 // Look for the monitor in the om_cache.
234
235 ByteSize cache_offset = JavaThread::om_cache_oops_offset();
236 ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
237 const int num_unrolled = OMCache::CAPACITY;
238 for (int i = 0; i < num_unrolled; i++) {
239 ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
240 ldr(t2, Address(rthread, cache_offset));
241 cmp(obj, t2);
242 br(Assembler::EQ, monitor_found);
243 cache_offset = cache_offset + OMCache::oop_to_oop_difference();
244 }
245
246 // Look for the monitor in the table.
247
248 // Get the hash code.
249 ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);
250
251 // Get the table and calculate the bucket's address
252 lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
253 ldr(t3, Address(t3));
254 ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
255 ands(t1_hash, t1_hash, t2);
256 ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));
257
258 // Read the monitor from the bucket.
259 ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));
260
261 // Check if the monitor in the bucket is special (empty, tombstone or removed).
262 cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
263 br(Assembler::LO, slow_path);
264
265 // Check if object matches.
266 ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
267 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
268 bs_asm->try_resolve_weak_handle_in_c2(this, t3, t2, slow_path);
269 cmp(t3, obj);
270 br(Assembler::NE, slow_path);
271
272 bind(monitor_found);
273 }
274
275 const Register t2_owner_addr = t2;
276 const Register t3_owner = t3;
277 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
278 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
279 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
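    // When !UseObjectMonitorTable, t1_monitor is the mark word, which still carries the
    // monitor tag (0b10), so the tag is subtracted in the field offsets above rather than
    // being stripped from the pointer first.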
280
281 Label monitor_locked;
282
283 // Compute owner address.
284 lea(t2_owner_addr, owner_address);
285
286 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
287 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
288 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
289 /*release*/ false, /*weak*/ false, t3_owner);
290 br(Assembler::EQ, monitor_locked);
291
292 // Check if recursive.
293 cmp(t3_owner, rscratch2);
294 br(Assembler::NE, slow_path);
295
296 // Recursive.
297 increment(recursions_address, 1);
298
299 bind(monitor_locked);
300 if (UseObjectMonitorTable) {
301 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
302 }
303 }
304
305 bind(locked);
306
307 #ifdef ASSERT
308 // Check that locked label is reached with Flags == EQ.
309 Label flag_correct;
310 br(Assembler::EQ, flag_correct);
311 stop("Fast Lock Flag != EQ");
312 #endif
313
314 bind(slow_path);
315 #ifdef ASSERT
316 // Check that slow_path label is reached with Flags == NE.
317 br(Assembler::NE, flag_correct);
318 stop("Fast Lock Flag != NE");
319 bind(flag_correct);
320 #endif
321 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
322 }
323
324 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
325 Register t2, Register t3) {
326 assert_different_registers(obj, box, t1, t2, t3);
327
328 // Handle inflated monitor.
329 Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST be reached with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST be reached with flag == NE.
333 Label slow_path;
334
335 const Register t1_mark = t1;
336 const Register t2_top = t2;
337 const Register t3_t = t3;
338
339 { // Fast unlock
340
341 Label push_and_slow_path;
342
343 // Check if obj is top of lock-stack.
344 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
345 subw(t2_top, t2_top, oopSize);
346 ldr(t3_t, Address(rthread, t2_top));
347 cmp(obj, t3_t);
348 // Top of lock stack was not obj. Must be monitor.
349 br(Assembler::NE, inflated_load_mark);
350
351 // Pop lock-stack.
352 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
353 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
354
355 // Check if recursive.
356 subw(t3_t, t2_top, oopSize);
357 ldr(t3_t, Address(rthread, t3_t));
358 cmp(obj, t3_t);
359 br(Assembler::EQ, unlocked);
360
361 // Not recursive.
362 // Load Mark.
363 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
364
365 // Check header for monitor (0b10).
366 // Because we got here by popping (meaning we pushed in locked)
367 // there will be no monitor in the box. So we need to push back the obj
368 // so that the runtime can fix any potential anonymous owner.
369 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
370
371 // Try to unlock. Transition lock bits 0b00 => 0b01
372 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
373 orr(t3_t, t1_mark, markWord::unlocked_value);
374 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
375 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
376 br(Assembler::EQ, unlocked);
377
378 bind(push_and_slow_path);
379 // Compare and exchange failed.
380 // Restore lock-stack and handle the unlock in runtime.
381 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
382 addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
384 b(slow_path);
385 }
386
387
388 { // Handle inflated monitor.
389 bind(inflated_load_mark);
390 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
391 #ifdef ASSERT
392 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
393 stop("Fast Unlock not monitor");
394 #endif
395
396 bind(inflated);
397
398 #ifdef ASSERT
399 Label check_done;
400 subw(t2_top, t2_top, oopSize);
401 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
402 br(Assembler::LT, check_done);
403 ldr(t3_t, Address(rthread, t2_top));
404 cmp(obj, t3_t);
405 br(Assembler::NE, inflated);
406 stop("Fast Unlock lock on stack");
407 bind(check_done);
408 #endif
409
410 const Register t1_monitor = t1;
411
412 if (!UseObjectMonitorTable) {
413 assert(t1_monitor == t1_mark, "should be the same here");
414
415 // Untag the monitor.
416 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
417 } else {
418 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
419 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
420 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
421 br(Assembler::LO, slow_path);
422 }
423
424 const Register t2_recursions = t2;
425 Label not_recursive;
426
427 // Check if recursive.
428 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
429 cbz(t2_recursions, not_recursive);
430
431 // Recursive unlock.
432 sub(t2_recursions, t2_recursions, 1u);
433 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
434 // Set flag == EQ
435 cmp(t2_recursions, t2_recursions);
436 b(unlocked);
437
438 bind(not_recursive);
439
440 const Register t2_owner_addr = t2;
441
442 // Compute owner address.
443 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
444
445 // Set owner to null.
446 // Release to satisfy the JMM
447 stlr(zr, t2_owner_addr);
448 // We need a full fence after clearing owner to avoid stranding.
449 // StoreLoad achieves this.
450 membar(StoreLoad);
451
452 // Check if the entry_list is empty.
453 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
454 cmp(rscratch1, zr);
455 br(Assembler::EQ, unlocked); // If so we are done.
456
457 // Check if there is a successor.
458 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
459 cmp(rscratch1, zr);
460 br(Assembler::NE, unlocked); // If so we are done.
461
462 // Save the monitor pointer in the current thread, so we can try to
463 // reacquire the lock in SharedRuntime::monitor_exit_helper().
464 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
465
466 cmp(zr, rthread); // Set Flag to NE => slow path
467 b(slow_path);
468 }
469
470 bind(unlocked);
471 cmp(zr, zr); // Set Flags to EQ => fast path
472
473 #ifdef ASSERT
474 // Check that unlocked label is reached with Flags == EQ.
475 Label flag_correct;
476 br(Assembler::EQ, flag_correct);
477 stop("Fast Unlock Flag != EQ");
478 #endif
479
480 bind(slow_path);
481 #ifdef ASSERT
482 // Check that slow_path label is reached with Flags == NE.
483 br(Assembler::NE, flag_correct);
484 stop("Fast Unlock Flag != NE");
485 bind(flag_correct);
486 #endif
487 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
488 }
489
490 // Search for str1 in str2 and return index or -1
491 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
492 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
493 Register cnt2, Register cnt1,
494 Register tmp1, Register tmp2,
495 Register tmp3, Register tmp4,
496 Register tmp5, Register tmp6,
497 int icnt1, Register result, int ae) {
498 // NOTE: tmp5, tmp6 can be zr depending on specific method version
499 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
500
501 Register ch1 = rscratch1;
502 Register ch2 = rscratch2;
503 Register cnt1tmp = tmp1;
504 Register cnt2tmp = tmp2;
505 Register cnt1_neg = cnt1;
506 Register cnt2_neg = cnt2;
507 Register result_tmp = tmp4;
508
509 bool isL = ae == StrIntrinsicNode::LL;
510
511 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
512 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
513 int str1_chr_shift = str1_isL ? 0:1;
514 int str2_chr_shift = str2_isL ? 0:1;
515 int str1_chr_size = str1_isL ? 1:2;
516 int str2_chr_size = str2_isL ? 1:2;
517 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
518 (chr_insn)&MacroAssembler::ldrh;
519 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
520 (chr_insn)&MacroAssembler::ldrh;
521 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
522 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
523
524 // Note, inline_string_indexOf() generates checks:
525 // if (substr.count > string.count) return -1;
526 // if (substr.count == 0) return 0;
527
528 // We have two strings, a source string in str2, cnt2 and a pattern string
529 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
530
531 // For larger pattern and source we use a simplified Boyer Moore algorithm.
532 // With a small pattern and source we use linear scan.
533
534 if (icnt1 == -1) {
535 sub(result_tmp, cnt2, cnt1);
536 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
537 br(LT, LINEARSEARCH);
538 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
539 subs(zr, cnt1, 256);
540 lsr(tmp1, cnt2, 2);
541 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
542 br(GE, LINEARSTUB);
543 }
544
  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has a few Java-specific optimizations.
563 //
564 // #define ASIZE 256
565 //
566 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
567 // int i, j;
568 // unsigned c;
569 // unsigned char bc[ASIZE];
570 //
571 // /* Preprocessing */
572 // for (i = 0; i < ASIZE; ++i)
573 // bc[i] = m;
574 // for (i = 0; i < m - 1; ) {
575 // c = x[i];
576 // ++i;
577 // // c < 256 for Latin1 string, so, no need for branch
578 // #ifdef PATTERN_STRING_IS_LATIN1
579 // bc[c] = m - i;
580 // #else
581 // if (c < ASIZE) bc[c] = m - i;
582 // #endif
583 // }
584 //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //      c = y[j + m - 1];
  //      if (x[m-1] == c) {
  //        for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //        if (i < 0) return j;
  //      }
  //      // c < 256 for Latin1 string, so, no need for branch
  //      #ifdef SOURCE_STRING_IS_LATIN1
  //      // LL case: (c < 256) always true. Remove branch
  //      j += bc[y[j+m-1]];
  //      #endif
  //      #ifdef PATTERN_STRING_IS_UTF
  //      // UU case: need if (c < ASIZE) check. Skip 1 character if not.
  //      if (c < ASIZE)
  //        j += bc[y[j+m-1]];
  //      else
  //        j += 1;
  //      #endif
  //      #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //      // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
  //      if (c < ASIZE)
  //        j += bc[y[j+m-1]];
  //      else
  //        j += m;
  //      #endif
  //   }
  // }
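  //
  // For example, for the Latin1 pattern "abcab" (m == 5), the preprocessing loop leaves
  // bc['a'] == 1, bc['b'] == 3, bc['c'] == 2 and bc[c] == 5 for every other byte c, so
  // after a mismatch the pattern is shifted along by the table entry of the last
  // character of the current search window.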
613
614 if (icnt1 == -1) {
615 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
616 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
617 Register cnt1end = tmp2;
618 Register str2end = cnt2;
619 Register skipch = tmp2;
620
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load
625 const int firstStep = isL ? 7 : 3;
626
627 const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
629 sub(sp, sp, ASIZE);
630 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
631 mov(ch1, sp);
632 BIND(BM_INIT_LOOP);
633 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
634 subs(tmp5, tmp5, 1);
635 br(GT, BM_INIT_LOOP);
636
637 sub(cnt1tmp, cnt1, 1);
638 mov(tmp5, str2);
639 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
640 sub(ch2, cnt1, 1);
641 mov(tmp3, str1);
642 BIND(BCLOOP);
643 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
644 if (!str1_isL) {
645 subs(zr, ch1, ASIZE);
646 br(HS, BCSKIP);
647 }
648 strb(ch2, Address(sp, ch1));
649 BIND(BCSKIP);
650 subs(ch2, ch2, 1);
651 br(GT, BCLOOP);
652
653 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
654 if (str1_isL == str2_isL) {
655 // load last 8 bytes (8LL/4UU symbols)
656 ldr(tmp6, Address(tmp6, -wordSize));
657 } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
      // convert Latin1 to UTF. We'll have to wait until the load completes,
      // but it's still faster than per-character loads+checks
661 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
662 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
663 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
664 andr(tmp6, tmp6, 0xFF); // str1[N-4]
665 orr(ch2, ch1, ch2, LSL, 16);
666 orr(tmp6, tmp6, tmp3, LSL, 48);
667 orr(tmp6, tmp6, ch2, LSL, 16);
668 }
669 BIND(BMLOOPSTR2);
670 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
671 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
672 if (str1_isL == str2_isL) {
      // re-init tmp3. It's free because it's executed in parallel with
      // the load above. The alternative is to initialize it before the loop,
      // but that would affect performance on in-order systems with 2 or more
      // ld/st pipelines
676 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
677 }
678 if (!isL) { // UU/UL case
679 lsl(ch2, cnt1tmp, 1); // offset in bytes
680 }
681 cmp(tmp3, skipch);
682 br(NE, BMSKIP);
683 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
684 mov(ch1, tmp6);
685 if (isL) {
686 b(BMLOOPSTR1_AFTER_LOAD);
687 } else {
688 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
689 b(BMLOOPSTR1_CMP);
690 }
691 BIND(BMLOOPSTR1);
692 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
693 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
694 BIND(BMLOOPSTR1_AFTER_LOAD);
695 subs(cnt1tmp, cnt1tmp, 1);
696 br(LT, BMLOOPSTR1_LASTCMP);
697 BIND(BMLOOPSTR1_CMP);
698 cmp(ch1, ch2);
699 br(EQ, BMLOOPSTR1);
700 BIND(BMSKIP);
701 if (!isL) {
      // if we've met a UTF symbol while searching with a Latin1 pattern, then
      // we can skip cnt1 symbols
704 if (str1_isL != str2_isL) {
705 mov(result_tmp, cnt1);
706 } else {
707 mov(result_tmp, 1);
708 }
709 subs(zr, skipch, ASIZE);
710 br(HS, BMADV);
711 }
712 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
713 BIND(BMADV);
714 sub(cnt1tmp, cnt1, 1);
715 add(str2, str2, result_tmp, LSL, str2_chr_shift);
716 cmp(str2, str2end);
717 br(LE, BMLOOPSTR2);
718 add(sp, sp, ASIZE);
719 b(NOMATCH);
720 BIND(BMLOOPSTR1_LASTCMP);
721 cmp(ch1, ch2);
722 br(NE, BMSKIP);
723 BIND(BMMATCH);
724 sub(result, str2, tmp5);
725 if (!str2_isL) lsr(result, result, 1);
726 add(sp, sp, ASIZE);
727 b(DONE);
728
729 BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
731 br(LT, LINEAR_MEDIUM);
732 mov(result, zr);
733 RuntimeAddress stub = nullptr;
734 if (isL) {
735 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
736 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
737 } else if (str1_isL) {
738 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
739 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
740 } else {
741 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
742 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
743 }
744 address call = trampoline_call(stub);
745 if (call == nullptr) {
746 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
747 ciEnv::current()->record_failure("CodeCache is full");
748 return;
749 }
750 b(DONE);
751 }
752
753 BIND(LINEARSEARCH);
754 {
755 Label DO1, DO2, DO3;
756
757 Register str2tmp = tmp2;
758 Register first = tmp3;
759
760 if (icnt1 == -1)
761 {
762 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
763
764 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
765 br(LT, DOSHORT);
766 BIND(LINEAR_MEDIUM);
767 (this->*str1_load_1chr)(first, Address(str1));
768 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
769 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
770 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
771 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
772
773 BIND(FIRST_LOOP);
774 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
775 cmp(first, ch2);
776 br(EQ, STR1_LOOP);
777 BIND(STR2_NEXT);
778 adds(cnt2_neg, cnt2_neg, str2_chr_size);
779 br(LE, FIRST_LOOP);
780 b(NOMATCH);
781
782 BIND(STR1_LOOP);
783 adds(cnt1tmp, cnt1_neg, str1_chr_size);
784 add(cnt2tmp, cnt2_neg, str2_chr_size);
785 br(GE, MATCH);
786
787 BIND(STR1_NEXT);
788 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
789 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
790 cmp(ch1, ch2);
791 br(NE, STR2_NEXT);
792 adds(cnt1tmp, cnt1tmp, str1_chr_size);
793 add(cnt2tmp, cnt2tmp, str2_chr_size);
794 br(LT, STR1_NEXT);
795 b(MATCH);
796
797 BIND(DOSHORT);
798 if (str1_isL == str2_isL) {
799 cmp(cnt1, (u1)2);
800 br(LT, DO1);
801 br(GT, DO3);
802 }
803 }
804
805 if (icnt1 == 4) {
806 Label CH1_LOOP;
807
808 (this->*load_4chr)(ch1, str1);
809 sub(result_tmp, cnt2, 4);
810 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
811 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
812
813 BIND(CH1_LOOP);
814 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
815 cmp(ch1, ch2);
816 br(EQ, MATCH);
817 adds(cnt2_neg, cnt2_neg, str2_chr_size);
818 br(LE, CH1_LOOP);
819 b(NOMATCH);
820 }
821
822 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
823 Label CH1_LOOP;
824
825 BIND(DO2);
826 (this->*load_2chr)(ch1, str1);
827 if (icnt1 == 2) {
828 sub(result_tmp, cnt2, 2);
829 }
830 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
831 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
832 BIND(CH1_LOOP);
833 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
834 cmp(ch1, ch2);
835 br(EQ, MATCH);
836 adds(cnt2_neg, cnt2_neg, str2_chr_size);
837 br(LE, CH1_LOOP);
838 b(NOMATCH);
839 }
840
841 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
842 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
843
844 BIND(DO3);
845 (this->*load_2chr)(first, str1);
846 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
847 if (icnt1 == 3) {
848 sub(result_tmp, cnt2, 3);
849 }
850 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
851 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
852 BIND(FIRST_LOOP);
853 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
854 cmpw(first, ch2);
855 br(EQ, STR1_LOOP);
856 BIND(STR2_NEXT);
857 adds(cnt2_neg, cnt2_neg, str2_chr_size);
858 br(LE, FIRST_LOOP);
859 b(NOMATCH);
860
861 BIND(STR1_LOOP);
862 add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
863 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
864 cmp(ch1, ch2);
865 br(NE, STR2_NEXT);
866 b(MATCH);
867 }
868
869 if (icnt1 == -1 || icnt1 == 1) {
870 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
871
872 BIND(DO1);
873 (this->*str1_load_1chr)(ch1, str1);
874 cmp(cnt2, (u1)8);
875 br(LT, DO1_SHORT);
876
877 sub(result_tmp, cnt2, 8/str2_chr_size);
878 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
879 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
880 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
881
882 if (str2_isL) {
883 orr(ch1, ch1, ch1, LSL, 8);
884 }
885 orr(ch1, ch1, ch1, LSL, 16);
886 orr(ch1, ch1, ch1, LSL, 32);
887 BIND(CH1_LOOP);
888 ldr(ch2, Address(str2, cnt2_neg));
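      // The next four instructions are a SWAR zero test: after the eor, ch2 holds a zero
      // character exactly where str2 matches ch1; (ch2 - 0x01..01) & ~(ch2 | 0x7f..7f) is
      // non-zero iff some character of ch2 is zero, so bics sets NE exactly on a match
      // (the halfword variants of the constants are used for UTF-16).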
889 eor(ch2, ch1, ch2);
890 sub(tmp1, ch2, tmp3);
891 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
892 bics(tmp1, tmp1, tmp2);
893 br(NE, HAS_ZERO);
894 adds(cnt2_neg, cnt2_neg, 8);
895 br(LT, CH1_LOOP);
896
897 cmp(cnt2_neg, (u1)8);
898 mov(cnt2_neg, 0);
899 br(LT, CH1_LOOP);
900 b(NOMATCH);
901
902 BIND(HAS_ZERO);
903 rev(tmp1, tmp1);
904 clz(tmp1, tmp1);
905 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
906 b(MATCH);
907
908 BIND(DO1_SHORT);
909 mov(result_tmp, cnt2);
910 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
911 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
912 BIND(DO1_LOOP);
913 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
914 cmpw(ch1, ch2);
915 br(EQ, MATCH);
916 adds(cnt2_neg, cnt2_neg, str2_chr_size);
917 br(LT, DO1_LOOP);
918 }
919 }
920 BIND(NOMATCH);
921 mov(result, -1);
922 b(DONE);
923 BIND(MATCH);
924 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
925 BIND(DONE);
926 }
927
928 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
929 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
930
931 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
932 Register ch, Register result,
933 Register tmp1, Register tmp2, Register tmp3)
934 {
935 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
936 Register cnt1_neg = cnt1;
937 Register ch1 = rscratch1;
938 Register result_tmp = rscratch2;
939
940 cbz(cnt1, NOMATCH);
941
942 cmp(cnt1, (u1)4);
943 br(LT, DO1_SHORT);
944
945 orr(ch, ch, ch, LSL, 16);
946 orr(ch, ch, ch, LSL, 32);
947
948 sub(cnt1, cnt1, 4);
949 mov(result_tmp, cnt1);
950 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
951 sub(cnt1_neg, zr, cnt1, LSL, 1);
952
953 mov(tmp3, 0x0001000100010001);
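  // tmp3 and the 0x7fff... constant below implement the same SWAR zero-halfword test as
  // in string_indexof: the flags end up NE iff some halfword of the eor result is zero,
  // i.e. iff ch occurs in the loaded word.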
954
955 BIND(CH1_LOOP);
956 ldr(ch1, Address(str1, cnt1_neg));
957 eor(ch1, ch, ch1);
958 sub(tmp1, ch1, tmp3);
959 orr(tmp2, ch1, 0x7fff7fff7fff7fff);
960 bics(tmp1, tmp1, tmp2);
961 br(NE, HAS_ZERO);
962 adds(cnt1_neg, cnt1_neg, 8);
963 br(LT, CH1_LOOP);
964
965 cmp(cnt1_neg, (u1)8);
966 mov(cnt1_neg, 0);
967 br(LT, CH1_LOOP);
968 b(NOMATCH);
969
970 BIND(HAS_ZERO);
971 rev(tmp1, tmp1);
972 clz(tmp1, tmp1);
973 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
974 b(MATCH);
975
976 BIND(DO1_SHORT);
977 mov(result_tmp, cnt1);
978 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
979 sub(cnt1_neg, zr, cnt1, LSL, 1);
980 BIND(DO1_LOOP);
981 ldrh(ch1, Address(str1, cnt1_neg));
982 cmpw(ch, ch1);
983 br(EQ, MATCH);
984 adds(cnt1_neg, cnt1_neg, 2);
985 br(LT, DO1_LOOP);
986 BIND(NOMATCH);
987 mov(result, -1);
988 b(DONE);
989 BIND(MATCH);
990 add(result, result_tmp, cnt1_neg, ASR, 1);
991 BIND(DONE);
992 }
993
994 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
995 Register ch, Register result,
996 FloatRegister ztmp1,
997 FloatRegister ztmp2,
998 PRegister tmp_pg,
999 PRegister tmp_pdn, bool isL)
1000 {
1001 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1002 assert(tmp_pg->is_governing(),
1003 "this register has to be a governing predicate register");
1004
1005 Label LOOP, MATCH, DONE, NOMATCH;
1006 Register vec_len = rscratch1;
1007 Register idx = rscratch2;
1008
  SIMD_RegVariant T = isL ? B : H;
1010
1011 cbz(cnt1, NOMATCH);
1012
  // Broadcast the char to be matched to every lane of the vector.
1014 sve_dup(ztmp2, T, ch);
1015 if (isL) {
1016 sve_cntb(vec_len);
1017 } else {
1018 sve_cnth(vec_len);
1019 }
1020 mov(idx, 0);
1021
1022 // Generate a predicate to control the reading of input string.
1023 sve_whilelt(tmp_pg, T, idx, cnt1);
1024
1025 BIND(LOOP);
1026 // Read a vector of 8- or 16-bit data depending on the string type. Note
1027 // that inactive elements indicated by the predicate register won't cause
1028 // a data read from memory to the destination vector.
1029 if (isL) {
1030 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1031 } else {
1032 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1033 }
1034 add(idx, idx, vec_len);
1035
1036 // Perform the comparison. An element of the destination predicate is set
1037 // to active if the particular char is matched.
1038 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1039
1040 // Branch if the particular char is found.
1041 br(NE, MATCH);
1042
1043 sve_whilelt(tmp_pg, T, idx, cnt1);
1044
  // Loop back if the particular char is not found.
1046 br(MI, LOOP);
1047
1048 BIND(NOMATCH);
1049 mov(result, -1);
1050 b(DONE);
1051
1052 BIND(MATCH);
1053 // Undo the index increment.
1054 sub(idx, idx, vec_len);
1055
  // Break the predicate after the first match; counting the active lanes below yields the match position.
1057 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1058 add(result, idx, -1);
1059 sve_incp(result, T, tmp_pdn);
1060 BIND(DONE);
1061 }
1062
1063 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1064 Register ch, Register result,
1065 Register tmp1, Register tmp2, Register tmp3)
1066 {
1067 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1068 Register cnt1_neg = cnt1;
1069 Register ch1 = rscratch1;
1070 Register result_tmp = rscratch2;
1071
1072 cbz(cnt1, NOMATCH);
1073
1074 cmp(cnt1, (u1)8);
1075 br(LT, DO1_SHORT);
1076
1077 orr(ch, ch, ch, LSL, 8);
1078 orr(ch, ch, ch, LSL, 16);
1079 orr(ch, ch, ch, LSL, 32);
1080
1081 sub(cnt1, cnt1, 8);
1082 mov(result_tmp, cnt1);
1083 lea(str1, Address(str1, cnt1));
1084 sub(cnt1_neg, zr, cnt1);
1085
1086 mov(tmp3, 0x0101010101010101);
1087
1088 BIND(CH1_LOOP);
1089 ldr(ch1, Address(str1, cnt1_neg));
1090 eor(ch1, ch, ch1);
1091 sub(tmp1, ch1, tmp3);
1092 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1093 bics(tmp1, tmp1, tmp2);
1094 br(NE, HAS_ZERO);
1095 adds(cnt1_neg, cnt1_neg, 8);
1096 br(LT, CH1_LOOP);
1097
1098 cmp(cnt1_neg, (u1)8);
1099 mov(cnt1_neg, 0);
1100 br(LT, CH1_LOOP);
1101 b(NOMATCH);
1102
1103 BIND(HAS_ZERO);
1104 rev(tmp1, tmp1);
1105 clz(tmp1, tmp1);
1106 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1107 b(MATCH);
1108
1109 BIND(DO1_SHORT);
1110 mov(result_tmp, cnt1);
1111 lea(str1, Address(str1, cnt1));
1112 sub(cnt1_neg, zr, cnt1);
1113 BIND(DO1_LOOP);
1114 ldrb(ch1, Address(str1, cnt1_neg));
1115 cmp(ch, ch1);
1116 br(EQ, MATCH);
1117 adds(cnt1_neg, cnt1_neg, 1);
1118 br(LT, DO1_LOOP);
1119 BIND(NOMATCH);
1120 mov(result, -1);
1121 b(DONE);
1122 BIND(MATCH);
1123 add(result, result_tmp, cnt1_neg);
1124 BIND(DONE);
1125 }
1126
1127 // Compare strings.
1128 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1129 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1130 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1131 PRegister pgtmp1, PRegister pgtmp2, int ae) {
1132 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1133 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1134 SHORT_LOOP_START, TAIL_CHECK;
1135
1136 bool isLL = ae == StrIntrinsicNode::LL;
1137 bool isLU = ae == StrIntrinsicNode::LU;
1138 bool isUL = ae == StrIntrinsicNode::UL;
1139
1140 // The stub threshold for LL strings is: 72 (64 + 8) chars
1141 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1142 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1143 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1144
1145 bool str1_isL = isLL || isLU;
1146 bool str2_isL = isLL || isUL;
1147
1148 int str1_chr_shift = str1_isL ? 0 : 1;
1149 int str2_chr_shift = str2_isL ? 0 : 1;
1150 int str1_chr_size = str1_isL ? 1 : 2;
1151 int str2_chr_size = str2_isL ? 1 : 2;
1152 int minCharsInWord = isLL ? wordSize : wordSize/2;
1153
1154 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1155 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1156 (chr_insn)&MacroAssembler::ldrh;
1157 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1158 (chr_insn)&MacroAssembler::ldrh;
1159 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1160 (uxt_insn)&MacroAssembler::uxthw;
1161
1162 BLOCK_COMMENT("string_compare {");
1163
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
1166 if (!str1_isL) asrw(cnt1, cnt1, 1);
1167 if (!str2_isL) asrw(cnt2, cnt2, 1);
1168
1169 // Compute the minimum of the string lengths and save the difference.
1170 subsw(result, cnt1, cnt2);
1171 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1172
1173 // A very short string
1174 cmpw(cnt2, minCharsInWord);
1175 br(Assembler::LE, SHORT_STRING);
1176
1177 // Compare longwords
1178 // load first parts of strings and finish initialization while loading
1179 {
1180 if (str1_isL == str2_isL) { // LL or UU
1181 ldr(tmp1, Address(str1));
1182 cmp(str1, str2);
1183 br(Assembler::EQ, DONE);
1184 ldr(tmp2, Address(str2));
1185 cmp(cnt2, stub_threshold);
1186 br(GE, STUB);
1187 subsw(cnt2, cnt2, minCharsInWord);
1188 br(EQ, TAIL_CHECK);
1189 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1190 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1191 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1192 } else if (isLU) {
1193 ldrs(vtmp, Address(str1));
1194 ldr(tmp2, Address(str2));
1195 cmp(cnt2, stub_threshold);
1196 br(GE, STUB);
1197 subw(cnt2, cnt2, 4);
1198 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1199 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1200 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1201 zip1(vtmp, T8B, vtmp, vtmpZ);
1202 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1203 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1204 add(cnt1, cnt1, 4);
1205 fmovd(tmp1, vtmp);
1206 } else { // UL case
1207 ldr(tmp1, Address(str1));
1208 ldrs(vtmp, Address(str2));
1209 cmp(cnt2, stub_threshold);
1210 br(GE, STUB);
1211 subw(cnt2, cnt2, 4);
1212 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1213 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1214 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1215 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1216 zip1(vtmp, T8B, vtmp, vtmpZ);
1217 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1218 add(cnt1, cnt1, 8);
1219 fmovd(tmp2, vtmp);
1220 }
1221 adds(cnt2, cnt2, isUL ? 4 : 8);
1222 br(GE, TAIL);
1223 eor(rscratch2, tmp1, tmp2);
1224 cbnz(rscratch2, DIFF);
1225 // main loop
1226 bind(NEXT_WORD);
1227 if (str1_isL == str2_isL) {
1228 ldr(tmp1, Address(str1, cnt2));
1229 ldr(tmp2, Address(str2, cnt2));
1230 adds(cnt2, cnt2, 8);
1231 } else if (isLU) {
1232 ldrs(vtmp, Address(str1, cnt1));
1233 ldr(tmp2, Address(str2, cnt2));
1234 add(cnt1, cnt1, 4);
1235 zip1(vtmp, T8B, vtmp, vtmpZ);
1236 fmovd(tmp1, vtmp);
1237 adds(cnt2, cnt2, 8);
1238 } else { // UL
1239 ldrs(vtmp, Address(str2, cnt2));
1240 ldr(tmp1, Address(str1, cnt1));
1241 zip1(vtmp, T8B, vtmp, vtmpZ);
1242 add(cnt1, cnt1, 8);
1243 fmovd(tmp2, vtmp);
1244 adds(cnt2, cnt2, 4);
1245 }
1246 br(GE, TAIL);
1247
1248 eor(rscratch2, tmp1, tmp2);
1249 cbz(rscratch2, NEXT_WORD);
1250 b(DIFF);
1251 bind(TAIL);
1252 eor(rscratch2, tmp1, tmp2);
1253 cbnz(rscratch2, DIFF);
1254 // Last longword. In the case where length == 4 we compare the
1255 // same longword twice, but that's still faster than another
1256 // conditional branch.
1257 if (str1_isL == str2_isL) {
1258 ldr(tmp1, Address(str1));
1259 ldr(tmp2, Address(str2));
1260 } else if (isLU) {
1261 ldrs(vtmp, Address(str1));
1262 ldr(tmp2, Address(str2));
1263 zip1(vtmp, T8B, vtmp, vtmpZ);
1264 fmovd(tmp1, vtmp);
1265 } else { // UL
1266 ldrs(vtmp, Address(str2));
1267 ldr(tmp1, Address(str1));
1268 zip1(vtmp, T8B, vtmp, vtmpZ);
1269 fmovd(tmp2, vtmp);
1270 }
1271 bind(TAIL_CHECK);
1272 eor(rscratch2, tmp1, tmp2);
1273 cbz(rscratch2, DONE);
1274
1275 // Find the first different characters in the longwords and
1276 // compute their difference.
1277 bind(DIFF);
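    // rev + clz yield the bit index of the first differing byte; rounding it down to a
    // character boundary with andr lets lsrv shift the differing characters of both
    // words down to bit 0 before they are zero-extended and subtracted.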
1278 rev(rscratch2, rscratch2);
1279 clz(rscratch2, rscratch2);
1280 andr(rscratch2, rscratch2, isLL ? -8 : -16);
1281 lsrv(tmp1, tmp1, rscratch2);
1282 (this->*ext_chr)(tmp1, tmp1);
1283 lsrv(tmp2, tmp2, rscratch2);
1284 (this->*ext_chr)(tmp2, tmp2);
1285 subw(result, tmp1, tmp2);
1286 b(DONE);
1287 }
1288
1289 bind(STUB);
1290 RuntimeAddress stub = nullptr;
1291 switch(ae) {
1292 case StrIntrinsicNode::LL:
1293 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1294 break;
1295 case StrIntrinsicNode::UU:
1296 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1297 break;
1298 case StrIntrinsicNode::LU:
1299 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1300 break;
1301 case StrIntrinsicNode::UL:
1302 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1303 break;
1304 default:
1305 ShouldNotReachHere();
1306 }
1307 assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1308 address call = trampoline_call(stub);
1309 if (call == nullptr) {
1310 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1311 ciEnv::current()->record_failure("CodeCache is full");
1312 return;
1313 }
1314 b(DONE);
1315
1316 bind(SHORT_STRING);
1317 // Is the minimum length zero?
1318 cbz(cnt2, DONE);
  // Arrange the code so that most branches are taken while loading, and the next
  // characters are loaded while the previous ones are being compared
1321 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1322 subs(cnt2, cnt2, 1);
1323 br(EQ, SHORT_LAST_INIT);
1324 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1325 b(SHORT_LOOP_START);
1326 bind(SHORT_LOOP);
1327 subs(cnt2, cnt2, 1);
1328 br(EQ, SHORT_LAST);
1329 bind(SHORT_LOOP_START);
1330 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1331 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1332 cmp(tmp1, cnt1);
1333 br(NE, SHORT_LOOP_TAIL);
1334 subs(cnt2, cnt2, 1);
1335 br(EQ, SHORT_LAST2);
1336 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1337 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1338 cmp(tmp2, rscratch1);
1339 br(EQ, SHORT_LOOP);
1340 sub(result, tmp2, rscratch1);
1341 b(DONE);
1342 bind(SHORT_LOOP_TAIL);
1343 sub(result, tmp1, cnt1);
1344 b(DONE);
1345 bind(SHORT_LAST2);
1346 cmp(tmp2, rscratch1);
1347 br(EQ, DONE);
1348 sub(result, tmp2, rscratch1);
1349
1350 b(DONE);
1351 bind(SHORT_LAST_INIT);
1352 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1353 bind(SHORT_LAST);
1354 cmp(tmp1, cnt1);
1355 br(EQ, DONE);
1356 sub(result, tmp1, cnt1);
1357
1358 bind(DONE);
1359
1360 BLOCK_COMMENT("} string_compare");
1361 }
1362
1363 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1364 FloatRegister src2, Condition cond, bool isQ) {
1365 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1366 FloatRegister zn = src1, zm = src2;
1367 bool needs_negation = false;
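  // Neon provides only the EQ/GT/GE/HI/HS vector compares, so LT/LE/LO/LS are derived by
  // swapping the operands, and NE by negating the EQ result.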
1368 switch (cond) {
1369 case LT: cond = GT; zn = src2; zm = src1; break;
1370 case LE: cond = GE; zn = src2; zm = src1; break;
1371 case LO: cond = HI; zn = src2; zm = src1; break;
1372 case LS: cond = HS; zn = src2; zm = src1; break;
1373 case NE: cond = EQ; needs_negation = true; break;
1374 default:
1375 break;
1376 }
1377
1378 if (is_floating_point_type(bt)) {
1379 fcm(cond, dst, size, zn, zm);
1380 } else {
1381 cm(cond, dst, size, zn, zm);
1382 }
1383
1384 if (needs_negation) {
1385 notr(dst, isQ ? T16B : T8B, dst);
1386 }
1387 }
1388
1389 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1390 Condition cond, bool isQ) {
1391 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1392 if (bt == T_FLOAT || bt == T_DOUBLE) {
1393 if (cond == Assembler::NE) {
1394 fcm(Assembler::EQ, dst, size, src);
1395 notr(dst, isQ ? T16B : T8B, dst);
1396 } else {
1397 fcm(cond, dst, size, src);
1398 }
1399 } else {
1400 if (cond == Assembler::NE) {
1401 cm(Assembler::EQ, dst, size, src);
1402 notr(dst, isQ ? T16B : T8B, dst);
1403 } else {
1404 cm(cond, dst, size, src);
1405 }
1406 }
1407 }
1408
1409 // Compress the least significant bit of each byte to the rightmost and clear
1410 // the higher garbage bits.
1411 void C2_MacroAssembler::bytemask_compress(Register dst) {
1412 // Example input, dst = 0x01 00 00 00 01 01 00 01
1413 // The "??" bytes are garbage.
1414 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1415 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1416 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1417 andr(dst, dst, 0xff); // dst = 0x8D
1418 }
1419
// Pack the value of each mask element in "src" into a long value in "dst", for
// at most the first 64 lane elements. The input "src" is a vector of booleans
// represented as bytes with 0x00/0x01 as element values. Each lane value from
// "src" is packed into one bit in "dst".
1424 //
1425 // Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1426 // Expected: dst = 0x658D
1427 //
1428 // Clobbers: rscratch1
1429 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1430 FloatRegister vtmp, int lane_cnt) {
1431 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1432 assert_different_registers(dst, rscratch1);
1433 assert_different_registers(src, vtmp);
1434 assert(UseSVE > 0, "must be");
1435
1436 // Compress the lowest 8 bytes.
1437 fmovd(dst, src);
1438 bytemask_compress(dst);
1439 if (lane_cnt <= 8) return;
1440
1441 // Repeat on higher bytes and join the results.
1442 // Compress 8 bytes in each iteration.
1443 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1444 sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1445 bytemask_compress(rscratch1);
1446 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1447 }
1448 }
1449
// This function is the same as "sve_vmask_tolong" above, but it uses SVE2's BEXT
// instruction, which requires the FEAT_BITPERM feature.
1452 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1453 FloatRegister vtmp1, FloatRegister vtmp2,
1454 int lane_cnt) {
1455 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1456 assert_different_registers(src, vtmp1, vtmp2);
1457 assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1458
1459 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1460 // is to compress each significant bit of the byte in a cross-lane way. Due
1461 // to the lack of a cross-lane bit-compress instruction, we use BEXT
1462 // (bit-compress in each lane) with the biggest lane size (T = D) then
1463 // concatenate the results.
1464
1465 // The second source input of BEXT, initialized with 0x01 in each byte.
1466 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1467 sve_dup(vtmp2, B, 1);
1468
1469 // BEXT vtmp1.D, src.D, vtmp2.D
1470 // src = 0x0001010000010001 | 0x0100000001010001
1471 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1472 // ---------------------------------------
1473 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1474 sve_bext(vtmp1, D, src, vtmp2);
1475
  // Concatenate the least significant 8 bits of each 8-byte lane, and extract
  // the result to dst.
1478 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1479 // dst = 0x658D
1480 if (lane_cnt <= 8) {
1481 // No need to concatenate.
1482 umov(dst, vtmp1, B, 0);
1483 } else if (lane_cnt <= 16) {
1484 ins(vtmp1, B, vtmp1, 1, 8);
1485 umov(dst, vtmp1, H, 0);
1486 } else {
1487 // As the lane count is 64 at most, the final expected value must be in
1488 // the lowest 64 bits after narrowing vtmp1 from D to B.
1489 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1490 umov(dst, vtmp1, D, 0);
1491 }
1492 }
1493
// Unpack the mask, a long value in "src", into a vector register of booleans
// represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
// most 64 lanes.
//
// The example below gives the expected dst vector register for a valid
// src (0x658D) on a machine with a 128-bit vector size.
1501 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1502 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1503 FloatRegister vtmp, int lane_cnt) {
1504 assert_different_registers(dst, vtmp);
1505 assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1506 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1507
1508 // Example: src = 0x658D, lane_cnt = 16
1509 // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1510
  // Put the long value from the general-purpose register into the first lane of the vector.
1512 // vtmp = 0x0000000000000000 | 0x000000000000658D
1513 sve_dup(vtmp, B, 0);
1514 mov(vtmp, D, 0, src);
1515
  // Transform the value in the first lane from a mask in bits into a mask in
  // bytes, which can be done with SVE2's BDEP instruction.
1518
  // The first source input of the BDEP instruction. Place one mask byte into each 8-byte lane.
1520 // vtmp = 0x0000000000000065 | 0x000000000000008D
1521 if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
1523 } else if (lane_cnt <= 16) {
1524 ins(vtmp, B, vtmp, 8, 1);
1525 } else {
1526 sve_vector_extend(vtmp, D, vtmp, B);
1527 }
1528
1529 // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1530 // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1531 sve_dup(dst, B, 1);
1532
1533 // BDEP dst.D, vtmp.D, dst.D
1534 // vtmp = 0x0000000000000065 | 0x000000000000008D
1535 // dst = 0x0101010101010101 | 0x0101010101010101
1536 // ---------------------------------------
1537 // dst = 0x0001010000010001 | 0x0100000001010001
1538 sve_bdep(dst, D, vtmp, dst);
1539 }
1540
1541 // Clobbers: rflags
1542 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1543 FloatRegister zn, FloatRegister zm, Condition cond) {
1544 assert(pg->is_governing(), "This register has to be a governing predicate register");
1545 FloatRegister z1 = zn, z2 = zm;
1546 switch (cond) {
1547 case LE: z1 = zm; z2 = zn; cond = GE; break;
1548 case LT: z1 = zm; z2 = zn; cond = GT; break;
1549 case LO: z1 = zm; z2 = zn; cond = HI; break;
1550 case LS: z1 = zm; z2 = zn; cond = HS; break;
1551 default:
1552 break;
1553 }
1554
1555 SIMD_RegVariant size = elemType_to_regVariant(bt);
1556 if (is_floating_point_type(bt)) {
1557 sve_fcm(cond, pd, size, pg, z1, z2);
1558 } else {
1559 assert(is_integral_type(bt), "unsupported element type");
1560 sve_cmp(cond, pd, size, pg, z1, z2);
1561 }
1562 }
1563
1564 // Get index of the last mask lane that is set
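// For example, with 16 byte lanes and lane 10 as the last set lane: sve_rev moves that
// lane to index 5, sve_brkb activates lanes 0 to 4, sve_cntp returns 5, and
// dst = (16 - 1) - 5 = 10.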
1565 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1566 SIMD_RegVariant size = elemType_to_regVariant(bt);
1567 sve_rev(ptmp, size, src);
1568 sve_brkb(ptmp, ptrue, ptmp, false);
1569 sve_cntp(dst, size, ptrue, ptmp);
1570 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1571 subw(dst, rscratch1, dst);
1572 }
1573
1574 // Extend integer vector src to dst with the same lane count
1575 // but larger element size, e.g. 4B -> 4I
1576 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1577 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1578 if (src_bt == T_BYTE) {
1579 // 4B to 4S/4I, 8B to 8S
1580 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1581 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1582 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1583 if (dst_bt == T_INT) {
1584 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1585 }
1586 } else if (src_bt == T_SHORT) {
1587 // 2S to 2I/2L, 4S to 4I
1588 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1589 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1590 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1591 if (dst_bt == T_LONG) {
1592 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1593 }
1594 } else if (src_bt == T_INT) {
1595 // 2I to 2L
1596 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1597 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1598 } else {
1599 ShouldNotReachHere();
1600 }
1601 }
1602
1603 // Narrow integer vector src down to dst with the same lane count
1604 // but smaller element size, e.g. 4I -> 4B
1605 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1606 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1607 if (src_bt == T_SHORT) {
1608 // 4S/8S to 4B/8B
1609 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1610 assert(dst_bt == T_BYTE, "unsupported");
1611 xtn(dst, T8B, src, T8H);
1612 } else if (src_bt == T_INT) {
1613 // 2I to 2S, 4I to 4B/4S
1614 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1615 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1616 xtn(dst, T4H, src, T4S);
1617 if (dst_bt == T_BYTE) {
1618 xtn(dst, T8B, dst, T8H);
1619 }
1620 } else if (src_bt == T_LONG) {
1621 // 2L to 2S/2I
1622 assert(src_vlen_in_bytes == 16, "unsupported");
1623 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1624 xtn(dst, T2S, src, T2D);
1625 if (dst_bt == T_SHORT) {
1626 xtn(dst, T4H, dst, T4S);
1627 }
1628 } else {
1629 ShouldNotReachHere();
1630 }
1631 }
1632
1633 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1634 FloatRegister src, SIMD_RegVariant src_size,
1635 bool is_unsigned) {
1636 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1637
1638 if (src_size == B) {
1639 switch (dst_size) {
1640 case H:
1641 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1642 break;
1643 case S:
1644 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1645 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1646 break;
1647 case D:
1648 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1649 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1650 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1651 break;
1652 default:
1653 ShouldNotReachHere();
1654 }
1655 } else if (src_size == H) {
1656 if (dst_size == S) {
1657 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1658 } else { // D
1659 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1660 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1661 }
1662 } else if (src_size == S) {
1663 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1664 }
1665 }
1666
// Narrow the vector src down to dst with the specified element sizes.
// The high part of the dst vector will be filled with zero.
1669 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1670 FloatRegister src, SIMD_RegVariant src_size,
1671 FloatRegister tmp) {
1672 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1673 assert_different_registers(src, tmp);
1674 sve_dup(tmp, src_size, 0);
1675 if (src_size == D) {
1676 switch (dst_size) {
1677 case S:
1678 sve_uzp1(dst, S, src, tmp);
1679 break;
1680 case H:
1681 assert_different_registers(dst, tmp);
1682 sve_uzp1(dst, S, src, tmp);
1683 sve_uzp1(dst, H, dst, tmp);
1684 break;
1685 case B:
1686 assert_different_registers(dst, tmp);
1687 sve_uzp1(dst, S, src, tmp);
1688 sve_uzp1(dst, H, dst, tmp);
1689 sve_uzp1(dst, B, dst, tmp);
1690 break;
1691 default:
1692 ShouldNotReachHere();
1693 }
1694 } else if (src_size == S) {
1695 if (dst_size == H) {
1696 sve_uzp1(dst, H, src, tmp);
1697 } else { // B
1698 assert_different_registers(dst, tmp);
1699 sve_uzp1(dst, H, src, tmp);
1700 sve_uzp1(dst, B, dst, tmp);
1701 }
1702 } else if (src_size == H) {
1703 sve_uzp1(dst, B, src, tmp);
1704 }
1705 }
1706
// Extend the src predicate to the dst predicate with the same lane count but
// a larger element size, e.g. a mask of a 64-bit byte vector extended to a
// mask of a 512-bit long vector (64Byte -> 512Long).
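// Doubling the element size takes one punpklo; quadrupling takes two; an
// 8x widening (e.g. a byte mask to a long mask) takes three, each step
// widening the low-half lanes to twice their size.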
1709 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1710 uint dst_element_length_in_bytes,
1711 uint src_element_length_in_bytes) {
1712 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1713 sve_punpklo(dst, src);
1714 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1715 sve_punpklo(dst, src);
1716 sve_punpklo(dst, dst);
1717 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1718 sve_punpklo(dst, src);
1719 sve_punpklo(dst, dst);
1720 sve_punpklo(dst, dst);
1721 } else {
1722 assert(false, "unsupported");
1723 ShouldNotReachHere();
1724 }
1725 }
1726
// Narrow the src predicate to the dst predicate with the same lane count but
// a smaller element size, e.g. a mask of a 512-bit long vector narrowed to a
// mask of a 64-bit byte vector (512Long -> 64Byte).
1729 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1730 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in the src predicate are expected to be zero.
  // To ensure the higher-order bits of the resulting narrowed vector are 0, an all-zero
  // predicate is passed as the second argument. An example narrowing operation with a
  // given mask would be: 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
  // Mask (for 2 Longs)                              : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b)     : 0000 0000 0001 0000
  // Which translates to a mask for 2 ints           : TF (lower half is considered while upper half is 0)
1739 assert_different_registers(src, ptmp);
1740 assert_different_registers(dst, ptmp);
1741 sve_pfalse(ptmp);
1742 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1743 sve_uzp1(dst, B, src, ptmp);
1744 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1745 sve_uzp1(dst, H, src, ptmp);
1746 sve_uzp1(dst, B, dst, ptmp);
1747 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1748 sve_uzp1(dst, S, src, ptmp);
1749 sve_uzp1(dst, H, dst, ptmp);
1750 sve_uzp1(dst, B, dst, ptmp);
1751 } else {
1752 assert(false, "unsupported");
1753 ShouldNotReachHere();
1754 }
1755 }
1756
1757 // Vector reduction add for integral type with ASIMD instructions.
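// For example, for T_INT with a 128-bit vector, addv sums the four int lanes
// into lane 0 of vtmp, and the final result is that sum plus isrc.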
1758 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1759 Register isrc, FloatRegister vsrc,
1760 unsigned vector_length_in_bytes,
1761 FloatRegister vtmp) {
1762 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1763 assert_different_registers(dst, isrc);
1764 bool isQ = vector_length_in_bytes == 16;
1765
1766 BLOCK_COMMENT("neon_reduce_add_integral {");
1767 switch(bt) {
1768 case T_BYTE:
1769 addv(vtmp, isQ ? T16B : T8B, vsrc);
1770 smov(dst, vtmp, B, 0);
1771 addw(dst, dst, isrc, ext::sxtb);
1772 break;
1773 case T_SHORT:
1774 addv(vtmp, isQ ? T8H : T4H, vsrc);
1775 smov(dst, vtmp, H, 0);
1776 addw(dst, dst, isrc, ext::sxth);
1777 break;
1778 case T_INT:
1779 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1780 umov(dst, vtmp, S, 0);
1781 addw(dst, dst, isrc);
1782 break;
1783 case T_LONG:
1784 assert(isQ, "unsupported");
1785 addpd(vtmp, vsrc);
1786 umov(dst, vtmp, D, 0);
1787 add(dst, dst, isrc);
1788 break;
1789 default:
1790 assert(false, "unsupported");
1791 ShouldNotReachHere();
1792 }
1793 BLOCK_COMMENT("} neon_reduce_add_integral");
1794 }
1795
1796 // Vector reduction multiply for integral type with ASIMD instructions.
1797 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1798 // Clobbers: rscratch1
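// The vector is folded in halves, halving the number of live lanes per step,
// until two lanes remain; those are then combined with isrc using scalar
// multiplies.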
1799 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1800 Register isrc, FloatRegister vsrc,
1801 unsigned vector_length_in_bytes,
1802 FloatRegister vtmp1, FloatRegister vtmp2) {
1803 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1804 bool isQ = vector_length_in_bytes == 16;
1805
1806 BLOCK_COMMENT("neon_reduce_mul_integral {");
1807 switch(bt) {
1808 case T_BYTE:
1809 if (isQ) {
        // Iteratively multiply the lower and upper halves of the vector.
1811 // vtmp1 = vsrc[8:15]
1812 ins(vtmp1, D, vsrc, 0, 1);
1813 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1814 mulv(vtmp1, T8B, vtmp1, vsrc);
1815 // vtmp2 = vtmp1[4:7]
1816 ins(vtmp2, S, vtmp1, 0, 1);
1817 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1818 mulv(vtmp1, T8B, vtmp2, vtmp1);
1819 } else {
1820 ins(vtmp1, S, vsrc, 0, 1);
1821 mulv(vtmp1, T8B, vtmp1, vsrc);
1822 }
1823 // vtmp2 = vtmp1[2:3]
1824 ins(vtmp2, H, vtmp1, 0, 1);
1825 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1826 mulv(vtmp2, T8B, vtmp2, vtmp1);
1827 // dst = vtmp2[0] * isrc * vtmp2[1]
1828 umov(rscratch1, vtmp2, B, 0);
1829 mulw(dst, rscratch1, isrc);
1830 sxtb(dst, dst);
1831 umov(rscratch1, vtmp2, B, 1);
1832 mulw(dst, rscratch1, dst);
1833 sxtb(dst, dst);
1834 break;
1835 case T_SHORT:
1836 if (isQ) {
1837 ins(vtmp2, D, vsrc, 0, 1);
1838 mulv(vtmp2, T4H, vtmp2, vsrc);
1839 ins(vtmp1, S, vtmp2, 0, 1);
1840 mulv(vtmp1, T4H, vtmp1, vtmp2);
1841 } else {
1842 ins(vtmp1, S, vsrc, 0, 1);
1843 mulv(vtmp1, T4H, vtmp1, vsrc);
1844 }
1845 umov(rscratch1, vtmp1, H, 0);
1846 mulw(dst, rscratch1, isrc);
1847 sxth(dst, dst);
1848 umov(rscratch1, vtmp1, H, 1);
1849 mulw(dst, rscratch1, dst);
1850 sxth(dst, dst);
1851 break;
1852 case T_INT:
1853 if (isQ) {
1854 ins(vtmp1, D, vsrc, 0, 1);
1855 mulv(vtmp1, T2S, vtmp1, vsrc);
1856 } else {
1857 vtmp1 = vsrc;
1858 }
1859 umov(rscratch1, vtmp1, S, 0);
1860 mul(dst, rscratch1, isrc);
1861 umov(rscratch1, vtmp1, S, 1);
1862 mul(dst, rscratch1, dst);
1863 break;
1864 case T_LONG:
1865 umov(rscratch1, vsrc, D, 0);
1866 mul(dst, isrc, rscratch1);
1867 umov(rscratch1, vsrc, D, 1);
1868 mul(dst, dst, rscratch1);
1869 break;
1870 default:
1871 assert(false, "unsupported");
1872 ShouldNotReachHere();
1873 }
1874 BLOCK_COMMENT("} neon_reduce_mul_integral");
1875 }
1876
1877 // Vector reduction multiply for floating-point type with ASIMD instructions.
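// The lanes of vsrc are combined one at a time: each ins/ext moves the next
// lane into lane 0 of vtmp, which is then multiplied into dst. The linear
// chain preserves the strict left-to-right ordering required for
// floating-point reductions.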
1878 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1879 FloatRegister fsrc, FloatRegister vsrc,
1880 unsigned vector_length_in_bytes,
1881 FloatRegister vtmp) {
1882 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1883 bool isQ = vector_length_in_bytes == 16;
1884
1885 BLOCK_COMMENT("neon_reduce_mul_fp {");
1886 switch(bt) {
    // The T_SHORT type below is for the Float16 type, which also uses
    // floating-point instructions.
1889 case T_SHORT:
1890 fmulh(dst, fsrc, vsrc);
1891 ext(vtmp, T8B, vsrc, vsrc, 2);
1892 fmulh(dst, dst, vtmp);
1893 ext(vtmp, T8B, vsrc, vsrc, 4);
1894 fmulh(dst, dst, vtmp);
1895 ext(vtmp, T8B, vsrc, vsrc, 6);
1896 fmulh(dst, dst, vtmp);
1897 if (isQ) {
1898 ext(vtmp, T16B, vsrc, vsrc, 8);
1899 fmulh(dst, dst, vtmp);
1900 ext(vtmp, T16B, vsrc, vsrc, 10);
1901 fmulh(dst, dst, vtmp);
1902 ext(vtmp, T16B, vsrc, vsrc, 12);
1903 fmulh(dst, dst, vtmp);
1904 ext(vtmp, T16B, vsrc, vsrc, 14);
1905 fmulh(dst, dst, vtmp);
1906 }
1907 break;
1908 case T_FLOAT:
1909 fmuls(dst, fsrc, vsrc);
1910 ins(vtmp, S, vsrc, 0, 1);
1911 fmuls(dst, dst, vtmp);
1912 if (isQ) {
1913 ins(vtmp, S, vsrc, 0, 2);
1914 fmuls(dst, dst, vtmp);
1915 ins(vtmp, S, vsrc, 0, 3);
1916 fmuls(dst, dst, vtmp);
1917 }
1918 break;
1919 case T_DOUBLE:
1920 assert(isQ, "unsupported");
1921 fmuld(dst, fsrc, vsrc);
1922 ins(vtmp, D, vsrc, 0, 1);
1923 fmuld(dst, dst, vtmp);
1924 break;
1925 default:
1926 assert(false, "unsupported");
1927 ShouldNotReachHere();
1928 }
1929 BLOCK_COMMENT("} neon_reduce_mul_fp");
1930 }
1931
1932 // Vector reduction add for half float type with ASIMD instructions.
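// Each ext rotates vsrc down by 2, 4, 6, ... bytes so that each successive
// half-float lane lands in lane 0 of vtmp, where faddh accumulates it into
// dst, preserving the strict left-to-right ordering of the reduction.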
1933 void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
1934 unsigned vector_length_in_bytes, FloatRegister vtmp) {
1935 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1936 bool isQ = vector_length_in_bytes == 16;
1937
1938 BLOCK_COMMENT("neon_reduce_add_fp16 {");
1939 faddh(dst, fsrc, vsrc);
1940 ext(vtmp, T8B, vsrc, vsrc, 2);
1941 faddh(dst, dst, vtmp);
1942 ext(vtmp, T8B, vsrc, vsrc, 4);
1943 faddh(dst, dst, vtmp);
1944 ext(vtmp, T8B, vsrc, vsrc, 6);
1945 faddh(dst, dst, vtmp);
1946 if (isQ) {
1947 ext(vtmp, T16B, vsrc, vsrc, 8);
1948 faddh(dst, dst, vtmp);
1949 ext(vtmp, T16B, vsrc, vsrc, 10);
1950 faddh(dst, dst, vtmp);
1951 ext(vtmp, T16B, vsrc, vsrc, 12);
1952 faddh(dst, dst, vtmp);
1953 ext(vtmp, T16B, vsrc, vsrc, 14);
1954 faddh(dst, dst, vtmp);
1955 }
1956 BLOCK_COMMENT("} neon_reduce_add_fp16");
1957 }
1958
1959 // Helper to select logical instruction
1960 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1961 Register Rn, Register Rm,
1962 enum shift_kind kind, unsigned shift) {
1963 switch(opc) {
1964 case Op_AndReductionV:
1965 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1966 break;
1967 case Op_OrReductionV:
1968 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1969 break;
1970 case Op_XorReductionV:
1971 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1972 break;
1973 default:
1974 assert(false, "unsupported");
1975 ShouldNotReachHere();
1976 }
1977 }
1978
1979 // Vector reduction logical operations And, Or, Xor
1980 // Clobbers: rscratch1
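// The vector payload is first folded into a single 64-bit general register,
// then repeatedly folded in halves with shifted logical ops until only the
// lowest element remains, which is combined with isrc (and sign-extended for
// sub-word types).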
1981 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1982 Register isrc, FloatRegister vsrc,
1983 unsigned vector_length_in_bytes) {
1984 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1985 "unsupported");
1986 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1987 assert_different_registers(dst, isrc);
1988 bool isQ = vector_length_in_bytes == 16;
1989
1990 BLOCK_COMMENT("neon_reduce_logical {");
1991 umov(rscratch1, vsrc, isQ ? D : S, 0);
1992 umov(dst, vsrc, isQ ? D : S, 1);
1993 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1994 switch(bt) {
1995 case T_BYTE:
1996 if (isQ) {
1997 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1998 }
1999 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2000 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2001 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2002 sxtb(dst, dst);
2003 break;
2004 case T_SHORT:
2005 if (isQ) {
2006 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2007 }
2008 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2009 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2010 sxth(dst, dst);
2011 break;
2012 case T_INT:
2013 if (isQ) {
2014 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2015 }
2016 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2017 break;
2018 case T_LONG:
2019 assert(isQ, "unsupported");
2020 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2021 break;
2022 default:
2023 assert(false, "unsupported");
2024 ShouldNotReachHere();
2025 }
2026 BLOCK_COMMENT("} neon_reduce_logical");
2027 }
2028
2029 // Helper function to decode min/max reduction operation properties
2030 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2031 bool* is_unsigned,
2032 Condition* cond) {
2033 switch(opc) {
2034 case Op_MinReductionV:
2035 *is_min = true; *is_unsigned = false; *cond = LT; break;
2036 case Op_MaxReductionV:
2037 *is_min = false; *is_unsigned = false; *cond = GT; break;
2038 case Op_UMinReductionV:
2039 *is_min = true; *is_unsigned = true; *cond = LO; break;
2040 case Op_UMaxReductionV:
2041 *is_min = false; *is_unsigned = true; *cond = HI; break;
2042 default:
2043 ShouldNotReachHere();
2044 }
2045 }
2046
2047 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
// Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
2049 // Clobbers: rscratch1, rflags
2050 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2051 Register isrc, FloatRegister vsrc,
2052 unsigned vector_length_in_bytes,
2053 FloatRegister vtmp) {
2054 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
2055 opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
2056 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2057 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2058 assert_different_registers(dst, isrc);
2059 bool isQ = vector_length_in_bytes == 16;
2060 bool is_min;
2061 bool is_unsigned;
2062 Condition cond;
2063 decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2064 BLOCK_COMMENT("neon_reduce_minmax_integral {");
2065 if (bt == T_LONG) {
2066 assert(vtmp == fnoreg, "should be");
2067 assert(isQ, "should be");
2068 umov(rscratch1, vsrc, D, 0);
2069 cmp(isrc, rscratch1);
2070 csel(dst, isrc, rscratch1, cond);
2071 umov(rscratch1, vsrc, D, 1);
2072 cmp(dst, rscratch1);
2073 csel(dst, dst, rscratch1, cond);
2074 } else {
2075 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2076 if (size == T2S) {
2077 // For T2S (2x32-bit elements), use pairwise instructions because
2078 // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
2079 neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
2080 } else {
2081 // For other sizes, use reduction to scalar instructions.
2082 neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
2083 }
2084 if (bt == T_INT) {
2085 umov(dst, vtmp, S, 0);
2086 } else if (is_unsigned) {
2087 umov(dst, vtmp, elemType_to_regVariant(bt), 0);
2088 } else {
2089 smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2090 }
2091 cmpw(dst, isrc);
2092 cselw(dst, dst, isrc, cond);
2093 }
2094 BLOCK_COMMENT("} neon_reduce_minmax_integral");
2095 }
2096
2097 // Vector reduction for integral type with SVE instruction.
2098 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
// rflags is clobbered for the min/max reduction variants (Op_MinReductionV,
// Op_MaxReductionV, Op_UMinReductionV, Op_UMaxReductionV).
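// For example, Op_AddReductionVI on T_SHORT: sve_uaddv sums the active lanes
// into tmp, smov sign-extends the low 16 bits of the sum into dst, and addw
// adds src1 with dst sign-extended via sxth.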
2100 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2101 FloatRegister src2, PRegister pg, FloatRegister tmp) {
2102 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2103 assert(pg->is_governing(), "This register has to be a governing predicate register");
2104 assert_different_registers(src1, dst);
  // Registers "dst" and "tmp" are clobbered; "src1" and "src2" are preserved.
2106 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2107 switch (opc) {
2108 case Op_AddReductionVI: {
2109 sve_uaddv(tmp, size, pg, src2);
2110 if (bt == T_BYTE) {
2111 smov(dst, tmp, size, 0);
2112 addw(dst, src1, dst, ext::sxtb);
2113 } else if (bt == T_SHORT) {
2114 smov(dst, tmp, size, 0);
2115 addw(dst, src1, dst, ext::sxth);
2116 } else {
2117 umov(dst, tmp, size, 0);
2118 addw(dst, dst, src1);
2119 }
2120 break;
2121 }
2122 case Op_AddReductionVL: {
2123 sve_uaddv(tmp, size, pg, src2);
2124 umov(dst, tmp, size, 0);
2125 add(dst, dst, src1);
2126 break;
2127 }
2128 case Op_AndReductionV: {
2129 sve_andv(tmp, size, pg, src2);
2130 if (bt == T_INT || bt == T_LONG) {
2131 umov(dst, tmp, size, 0);
2132 } else {
2133 smov(dst, tmp, size, 0);
2134 }
2135 if (bt == T_LONG) {
2136 andr(dst, dst, src1);
2137 } else {
2138 andw(dst, dst, src1);
2139 }
2140 break;
2141 }
2142 case Op_OrReductionV: {
2143 sve_orv(tmp, size, pg, src2);
2144 if (bt == T_INT || bt == T_LONG) {
2145 umov(dst, tmp, size, 0);
2146 } else {
2147 smov(dst, tmp, size, 0);
2148 }
2149 if (bt == T_LONG) {
2150 orr(dst, dst, src1);
2151 } else {
2152 orrw(dst, dst, src1);
2153 }
2154 break;
2155 }
2156 case Op_XorReductionV: {
2157 sve_eorv(tmp, size, pg, src2);
2158 if (bt == T_INT || bt == T_LONG) {
2159 umov(dst, tmp, size, 0);
2160 } else {
2161 smov(dst, tmp, size, 0);
2162 }
2163 if (bt == T_LONG) {
2164 eor(dst, dst, src1);
2165 } else {
2166 eorw(dst, dst, src1);
2167 }
2168 break;
2169 }
2170 case Op_MaxReductionV:
2171 case Op_MinReductionV:
2172 case Op_UMaxReductionV:
2173 case Op_UMinReductionV: {
2174 bool is_min;
2175 bool is_unsigned;
2176 Condition cond;
2177 decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2178 sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
2179 // Move result from vector to general register
2180 if (is_unsigned || bt == T_INT || bt == T_LONG) {
2181 umov(dst, tmp, size, 0);
2182 } else {
2183 smov(dst, tmp, size, 0);
2184 }
2185 if (bt == T_LONG) {
2186 cmp(dst, src1);
2187 csel(dst, dst, src1, cond);
2188 } else {
2189 cmpw(dst, src1);
2190 cselw(dst, dst, src1, cond);
2191 }
2192 break;
2193 }
2194 default:
2195 assert(false, "unsupported");
2196 ShouldNotReachHere();
2197 }
2198
2199 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2200 if (bt == T_BYTE) {
2201 sxtb(dst, dst);
2202 } else if (bt == T_SHORT) {
2203 sxth(dst, dst);
2204 }
2205 }
2206 }
2207
// Set elements of the dst predicate to true for lanes in the range [0, lane_cnt), and
// to false otherwise. The input "lane_cnt" must be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and rFlagsReg.
2211 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2212 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2213 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2214
2215 // Set all elements to false if the input "lane_cnt" is zero.
2216 if (lane_cnt == 0) {
2217 sve_pfalse(dst);
2218 return;
2219 }
2220
2221 SIMD_RegVariant size = elemType_to_regVariant(bt);
2222 assert(size != Q, "invalid size");
2223
  // Set all lanes to true if "lane_cnt" equals the max lane count.
2225 if (lane_cnt == max_vector_length) {
2226 sve_ptrue(dst, size, /* ALL */ 0b11111);
2227 return;
2228 }
2229
2230 // Fixed numbers for "ptrue".
2231 switch(lane_cnt) {
2232 case 1: /* VL1 */
2233 case 2: /* VL2 */
2234 case 3: /* VL3 */
2235 case 4: /* VL4 */
2236 case 5: /* VL5 */
2237 case 6: /* VL6 */
2238 case 7: /* VL7 */
2239 case 8: /* VL8 */
2240 sve_ptrue(dst, size, lane_cnt);
2241 return;
2242 case 16:
2243 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2244 return;
2245 case 32:
2246 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2247 return;
2248 case 64:
2249 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2250 return;
2251 case 128:
2252 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2253 return;
2254 case 256:
2255 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2256 return;
2257 default:
2258 break;
2259 }
2260
2261 // Special patterns for "ptrue".
2262 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2263 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2264 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2265 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2266 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2267 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2268 } else {
2269 // Encode to "whileltw" for the remaining cases.
2270 mov(rscratch1, lane_cnt);
2271 sve_whileltw(dst, size, zr, rscratch1);
2272 }
2273 }
2274
2275 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2276 // Any remaining elements of dst will be filled with zero.
2277 // Clobbers: rscratch1
2278 // Preserves: mask, vzr
2279 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2280 FloatRegister vzr, FloatRegister vtmp,
2281 PRegister pgtmp, unsigned vector_length_in_bytes) {
2282 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2283 // When called by sve_compress_byte, src and vtmp may be the same register.
2284 assert_different_registers(dst, src, vzr);
2285 assert_different_registers(dst, vtmp, vzr);
2286 assert_different_registers(mask, pgtmp);
2287 // high <-- low
2288 // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
2289 // mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
2290 // Expected result: dst = 00 00 00 hh ee dd bb aa
2291
2292 // Extend lowest half to type INT.
2293 // dst = 00dd 00cc 00bb 00aa
2294 sve_uunpklo(dst, S, src);
2295 // pgtmp = 0001 0000 0001 0001
2296 sve_punpklo(pgtmp, mask);
  // Pack the active INT-sized elements to the right,
  // and fill the remaining elements with zero.
2299 // dst = 0000 00dd 00bb 00aa
2300 sve_compact(dst, S, dst, pgtmp);
2301 // Narrow the result back to type SHORT.
2302 // dst = 00 00 00 00 00 dd bb aa
2303 sve_uzp1(dst, H, dst, vzr);
2304
  // Return if the vector length is no more than MaxVectorSize/2, since the
  // upper half is invalid.
2307 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2308 return;
2309 }
2310
2311 // Count the active elements of lowest half.
2312 // rscratch1 = 3
2313 sve_cntp(rscratch1, S, ptrue, pgtmp);
2314
2315 // Repeat to the highest half.
2316 // pgtmp = 0001 0000 0000 0001
2317 sve_punpkhi(pgtmp, mask);
2318 // vtmp = 00hh 00gg 00ff 00ee
2319 sve_uunpkhi(vtmp, S, src);
2320 // vtmp = 0000 0000 00hh 00ee
2321 sve_compact(vtmp, S, vtmp, pgtmp);
2322 // vtmp = 00 00 00 00 00 00 hh ee
2323 sve_uzp1(vtmp, H, vtmp, vzr);
2324
2325 // pgtmp = 00 00 00 00 00 01 01 01
2326 sve_whilelt(pgtmp, H, zr, rscratch1);
2327 // Compressed low: dst = 00 00 00 00 00 dd bb aa
2328 // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2329 // Combine the compressed low with the compressed high:
2330 // dst = 00 00 00 hh ee dd bb aa
2331 sve_splice(dst, H, pgtmp, vtmp);
2332 }
2333
2334 // Clobbers: rscratch1, rscratch2
2335 // Preserves: src, mask
2336 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2337 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2338 PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2339 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2340 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2341 assert_different_registers(mask, ptmp, pgtmp);
2342 // high <-- low
2343 // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
2344 // mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2345 // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2346 FloatRegister vzr = vtmp3;
2347 sve_dup(vzr, B, 0);
2348
2349 // Extend lowest half to type SHORT.
2350 // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
2351 sve_uunpklo(vtmp1, H, src);
2352 // ptmp = 00 01 00 00 00 01 00 01
2353 sve_punpklo(ptmp, mask);
  // Pack the active SHORT-sized elements to the right,
  // and fill the remaining elements with zero.
2356 // dst = 00 00 00 00 00 0g 0c 0a
2357 unsigned extended_size = vector_length_in_bytes << 1;
2358 sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2359 // Narrow the result back to type BYTE.
2360 // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2361 sve_uzp1(dst, B, dst, vzr);
2362
  // Return if the vector length is no more than MaxVectorSize/2, since the
  // upper half is invalid.
2365 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2366 return;
2367 }
2368 // Count the active elements of lowest half.
2369 // rscratch2 = 3
2370 sve_cntp(rscratch2, H, ptrue, ptmp);
2371
2372 // Repeat to the highest half.
2373 // ptmp = 00 01 00 00 00 00 00 01
2374 sve_punpkhi(ptmp, mask);
2375 // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
2376 sve_uunpkhi(vtmp2, H, src);
2377 // vtmp1 = 00 00 00 00 00 00 0p 0i
2378 sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2379 // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2380 sve_uzp1(vtmp1, B, vtmp1, vzr);
2381
2382 // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2383 sve_whilelt(ptmp, B, zr, rscratch2);
2384 // Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2385 // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2386 // Combine the compressed low with the compressed high:
2387 // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2388 sve_splice(dst, B, ptmp, vtmp1);
2389 }
2390
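// Reverse the bits within each element of the vector. For non-byte types the
// bytes of each element are reversed first, so that rbit (which reverses the
// bits within each byte) completes a full element-wise bit reversal.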
2391 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2392 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2393 SIMD_Arrangement size = isQ ? T16B : T8B;
2394 if (bt == T_BYTE) {
2395 rbit(dst, size, src);
2396 } else {
2397 neon_reverse_bytes(dst, src, bt, isQ);
2398 rbit(dst, size, dst);
2399 }
2400 }
2401
2402 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2403 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2404 SIMD_Arrangement size = isQ ? T16B : T8B;
2405 switch (bt) {
2406 case T_BYTE:
2407 if (dst != src) {
2408 orr(dst, size, src, src);
2409 }
2410 break;
2411 case T_SHORT:
2412 rev16(dst, size, src);
2413 break;
2414 case T_INT:
2415 rev32(dst, size, src);
2416 break;
2417 case T_LONG:
2418 rev64(dst, size, src);
2419 break;
2420 default:
2421 assert(false, "unsupported");
2422 ShouldNotReachHere();
2423 }
2424 }
2425
// VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
// But since it only supports byte tables, we need to look up 2/4 bytes as a group.
// For VectorRearrange long/double, we compare the shuffle input with iota indices,
// and use bsl to implement the operation.
2431 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2432 FloatRegister shuffle, FloatRegister tmp,
2433 BasicType bt, bool isQ) {
2434 assert_different_registers(dst, src, shuffle, tmp);
2435 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2436 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2437
2438 // Here is an example that rearranges a NEON vector with 4 ints:
2439 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2440 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2441 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2442 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2443 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2444 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2445 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2446 // 4. Use Vm as index register, and use V1 as table register.
2447 // Then get V2 as the result by tbl NEON instructions.
2448 switch (bt) {
2449 case T_SHORT:
2450 mov(tmp, size1, 0x02);
2451 mulv(dst, size2, shuffle, tmp);
2452 mov(tmp, size2, 0x0100);
2453 addv(dst, size1, dst, tmp);
2454 tbl(dst, size1, src, 1, dst);
2455 break;
2456 case T_INT:
2457 case T_FLOAT:
2458 mov(tmp, size1, 0x04);
2459 mulv(dst, size2, shuffle, tmp);
2460 mov(tmp, size2, 0x03020100);
2461 addv(dst, size1, dst, tmp);
2462 tbl(dst, size1, src, 1, dst);
2463 break;
2464 case T_LONG:
2465 case T_DOUBLE:
2466 {
2467 int idx = vector_iota_entry_index(T_LONG);
2468 lea(rscratch1,
2469 ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
2470 ldrq(tmp, rscratch1);
      // Check whether the input "shuffle" is the same as the iota indices.
      // Select "src" if true, otherwise swap the two elements of "src".
2473 cm(EQ, dst, size2, shuffle, tmp);
2474 ext(tmp, size1, src, src, 8);
2475 bsl(dst, size1, src, tmp);
2476 }
2477 break;
2478 default:
2479 assert(false, "unsupported element type");
2480 ShouldNotReachHere();
2481 }
2482 }
2483
// Extract a scalar element from an SVE vector at position 'idx'.
// The input elements in src are expected to be of integral type.
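// If the element lies within the low 128 bits, it is read directly with a
// NEON umov/smov; otherwise the vector is first rotated down with sve_ext so
// that the wanted element lands in lane 0.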
2486 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2487 int idx, FloatRegister vtmp) {
2488 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2489 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2490 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2491 if (bt == T_INT || bt == T_LONG) {
2492 umov(dst, src, size, idx);
2493 } else {
2494 smov(dst, src, size, idx);
2495 }
2496 } else {
2497 sve_orr(vtmp, src, src);
2498 sve_ext(vtmp, vtmp, idx << size);
2499 if (bt == T_INT || bt == T_LONG) {
2500 umov(dst, vtmp, size, 0);
2501 } else {
2502 smov(dst, vtmp, size, 0);
2503 }
2504 }
2505 }
2506
2507 // java.lang.Math::round intrinsics
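// Math.round rounds half up, which matches no single AArch64 rounding mode,
// so two candidates are computed: round-to-nearest with ties away from zero,
// and floor(src + 0.5). The floor result is selected for negative lanes of
// small magnitude, where ties-away would round values such as -2.5 the wrong
// way.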
2508
2509 // Clobbers: rscratch1, rflags
2510 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2511 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2512 assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2513 switch (T) {
2514 case T2S:
2515 case T4S:
2516 fmovs(tmp1, T, 0.5f);
2517 mov(rscratch1, jint_cast(0x1.0p23f));
2518 break;
2519 case T2D:
2520 fmovd(tmp1, T, 0.5);
2521 mov(rscratch1, julong_cast(0x1.0p52));
2522 break;
2523 default:
2524 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2525 }
2526 fadd(tmp1, T, tmp1, src);
2527 fcvtms(tmp1, T, tmp1);
2528 // tmp1 = floor(src + 0.5, ties to even)
2529
2530 fcvtas(dst, T, src);
2531 // dst = round(src), ties to away
2532
2533 fneg(tmp3, T, src);
2534 dup(tmp2, T, rscratch1);
2535 cm(HS, tmp3, T, tmp3, tmp2);
2536 // tmp3 is now a set of flags
2537
2538 bif(dst, T16B, tmp1, tmp3);
2539 // result in dst
2540 }
2541
2542 // Clobbers: rscratch1, rflags
2543 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2544 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2545 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2546 assert_different_registers(tmp1, tmp2, src, dst);
2547
2548 switch (T) {
2549 case S:
2550 mov(rscratch1, jint_cast(0x1.0p23f));
2551 break;
2552 case D:
2553 mov(rscratch1, julong_cast(0x1.0p52));
2554 break;
2555 default:
2556 assert(T == S || T == D, "invalid register variant");
2557 }
2558
2559 sve_frinta(dst, T, ptrue, src);
2560 // dst = round(src), ties to away
2561
2562 Label none;
2563
2564 sve_fneg(tmp1, T, ptrue, src);
2565 sve_dup(tmp2, T, rscratch1);
2566 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2567 br(EQ, none);
2568 {
2569 sve_cpy(tmp1, T, pgtmp, 0.5);
2570 sve_fadd(tmp1, T, pgtmp, src);
2571 sve_frintm(dst, T, pgtmp, tmp1);
2572 // dst = floor(src + 0.5, ties to even)
2573 }
2574 bind(none);
2575
2576 sve_fcvtzs(dst, T, ptrue, dst, T);
2577 // result in dst
2578 }
2579
2580 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2581 FloatRegister one, SIMD_Arrangement T) {
2582 assert_different_registers(dst, src, zero, one);
2583 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
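
  // facgt/ushr build a mask of the magnitude bits for lanes with |src| > 0;
  // bsl then takes the magnitude bits from one and the sign bit from src,
  // yielding +/-1.0, while +/-0.0 and NaN lanes pass src through unchanged.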
2584
2585 facgt(dst, T, src, zero);
2586 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2587 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2588 }
2589
2590 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2591 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2592 assert_different_registers(dst, src, zero, one, vtmp);
2593 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2594
2595 sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN, 1 otherwise
2597 switch (T) {
2598 case S:
2599 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2600 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2601 // on the sign of the float value
2602 break;
2603 case D:
2604 sve_and(vtmp, T, min_jlong);
2605 sve_orr(vtmp, T, jlong_cast(1.0));
2606 break;
2607 default:
2608 assert(false, "unsupported");
2609 ShouldNotReachHere();
2610 }
2611 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2612 // Result in dst
2613 }
2614
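// C2 first emits code into a scratch buffer to measure its size; this helper
// reports whether the current emission is such a sizing pass.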
2615 bool C2_MacroAssembler::in_scratch_emit_size() {
2616 if (ciEnv::current()->task() != nullptr) {
2617 PhaseOutput* phase_output = Compile::current()->output();
2618 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2619 return true;
2620 }
2621 }
2622 return MacroAssembler::in_scratch_emit_size();
2623 }
2624
2625 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2626 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2627 }
2628
2629 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2630 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2631 if (t == TypeInt::INT) {
2632 return;
2633 }
2634
2635 BLOCK_COMMENT("verify_int_in_range {");
2636 Label L_success, L_failure;
2637
2638 jint lo = t->_lo;
2639 jint hi = t->_hi;
2640
2641 if (lo != min_jint) {
2642 subsw(rtmp, rval, lo);
2643 br(Assembler::LT, L_failure);
2644 }
2645 if (hi != max_jint) {
2646 subsw(rtmp, rval, hi);
2647 br(Assembler::GT, L_failure);
2648 }
2649 b(L_success);
2650
2651 bind(L_failure);
2652 movw(c_rarg0, idx);
2653 mov(c_rarg1, rval);
2654 movw(c_rarg2, lo);
2655 movw(c_rarg3, hi);
2656 reconstruct_frame_pointer(rtmp);
2657 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2658 hlt(0);
2659
2660 bind(L_success);
2661 BLOCK_COMMENT("} verify_int_in_range");
2662 }
2663
2664 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2665 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2666 }
2667
2668 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2669 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2670 if (t == TypeLong::LONG) {
2671 return;
2672 }
2673
2674 BLOCK_COMMENT("verify_long_in_range {");
2675 Label L_success, L_failure;
2676
2677 jlong lo = t->_lo;
2678 jlong hi = t->_hi;
2679
2680 if (lo != min_jlong) {
2681 subs(rtmp, rval, lo);
2682 br(Assembler::LT, L_failure);
2683 }
2684 if (hi != max_jlong) {
2685 subs(rtmp, rval, hi);
2686 br(Assembler::GT, L_failure);
2687 }
2688 b(L_success);
2689
2690 bind(L_failure);
2691 movw(c_rarg0, idx);
2692 mov(c_rarg1, rval);
2693 mov(c_rarg2, lo);
2694 mov(c_rarg3, hi);
2695 reconstruct_frame_pointer(rtmp);
2696 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2697 hlt(0);
2698
2699 bind(L_success);
2700 BLOCK_COMMENT("} verify_long_in_range");
2701 }
2702
2703 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2704 const int framesize = Compile::current()->output()->frame_size_in_bytes();
2705 if (PreserveFramePointer) {
2706 // frame pointer is valid
2707 #ifdef ASSERT
2708 // Verify frame pointer value in rfp.
2709 add(rtmp, sp, framesize - 2 * wordSize);
2710 Label L_success;
2711 cmp(rfp, rtmp);
2712 br(Assembler::EQ, L_success);
2713 stop("frame pointer mismatch");
2714 bind(L_success);
2715 #endif // ASSERT
2716 } else {
2717 add(rfp, sp, framesize - 2 * wordSize);
2718 }
2719 }
2720
// Selects elements from two source vectors (src1, src2) based on index values in the index
// register using Neon instructions, placing each selected element in the destination vector
// lane corresponding to the index lane. Each index in the index register must be in the
// range [0, 2 * NUM_ELEM), where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2727 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2728 FloatRegister src2, FloatRegister index,
2729 FloatRegister tmp, unsigned vector_length_in_bytes) {
2730 assert_different_registers(dst, src1, src2, tmp);
2731 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2732
2733 if (vector_length_in_bytes == 16) {
2734 assert(UseSVE <= 1, "sve must be <= 1");
2735 assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with a two-vector table
2737 tbl(dst, size, src1, 2, index);
2738 } else { // vector length == 8
2739 assert(UseSVE == 0, "must be Neon only");
2740 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2741 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2742 // instruction with one vector lookup
2743 ins(tmp, D, src1, 0, 0);
2744 ins(tmp, D, src2, 1, 0);
2745 tbl(dst, size, tmp, 1, index);
2746 }
2747 }
2748
// Selects elements from two source vectors (src1, src2) based on index values in the index
// register using SVE/SVE2 instructions, placing each selected element in the destination vector
// lane corresponding to the index lane. Each index in the index register must be in the
// range [0, 2 * NUM_ELEM), where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
2755 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2756 FloatRegister src2, FloatRegister index,
2757 FloatRegister tmp, SIMD_RegVariant T,
2758 unsigned vector_length_in_bytes) {
2759 assert_different_registers(dst, src1, src2, index, tmp);
2760
2761 if (vector_length_in_bytes == 8) {
2762 // We need to fit both the source vectors (src1, src2) in a single vector register because the
2763 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2764 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2765 // instruction with one vector lookup
2766 assert(UseSVE >= 1, "sve must be >= 1");
2767 ins(tmp, D, src1, 0, 0);
2768 ins(tmp, D, src2, 1, 0);
2769 sve_tbl(dst, T, tmp, index);
2770 } else { // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with a two-vector table.
    // The assertion vector_length_in_bytes == MaxVectorSize ensures that this operation
    // is not executed on machines where vector_length_in_bytes < MaxVectorSize,
    // with the sole exception of the 8B vector length handled above.
2775 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2776 assert(src1->successor() == src2, "Source registers must be ordered");
2777 sve_tbl(dst, T, src1, src2, index);
2778 }
2779 }
2780
2781 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2782 FloatRegister src2, FloatRegister index,
2783 FloatRegister tmp, BasicType bt,
2784 unsigned vector_length_in_bytes) {
2785
2786 assert_different_registers(dst, src1, src2, index, tmp);
2787
  // The cases that can reach this method are:
2789 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2790 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2791 //
2792 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2793 // and UseSVE = 2 with vector_length_in_bytes >= 8
2794 //
2795 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2796 // UseSVE = 1 with vector_length_in_bytes = 16
2797
2798 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2799 SIMD_RegVariant T = elemType_to_regVariant(bt);
2800 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2801 return;
2802 }
2803
2804 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2805 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2806 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2807
2808 bool isQ = vector_length_in_bytes == 16;
2809
2810 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2811 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2812
2813 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2814 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
  // The index values in the "index" register are in the range [0, 2 * NUM_ELEM) where NUM_ELEM
  // is the number of elements that fit in a vector. For example, for T_SHORT with 64-bit vector
  // length, the indices range over [0, 8).
  // As an example with 64-bit vector length and T_SHORT type, let index = [2, 5, 1, 0]:
  // Move the constant 0x02 into every byte of tmp:  tmp = [0x0202, 0x0202, 0x0202, 0x0202]
  // Multiply the index vector with tmp to yield:    dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
  // Move the constant 0x0100 into every 2B of tmp:  tmp = [0x0100, 0x0100, 0x0100, 0x0100]
  // Add the multiplied result to the vector in tmp to obtain the byte-level
  // offsets:                                        dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2824 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2825
2826 if (bt == T_BYTE) {
2827 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2828 } else {
2829 int elem_size = (bt == T_SHORT) ? 2 : 4;
2830 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2831
2832 mov(tmp, size1, elem_size);
2833 mulv(dst, size2, index, tmp);
2834 mov(tmp, size2, tbl_offset);
2835 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2836 // to select a set of 2B/4B
2837 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2838 }
2839 }
2840
2841 // Vector expand implementation. Elements from the src vector are expanded into
2842 // the dst vector under the control of the vector mask.
2843 // Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
2845 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2846 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2847 // for NEON and SVE, but with different instructions where appropriate.
2848
2849 // Vector expand implementation for NEON.
2850 //
2851 // An example of 128-bit Byte vector:
2852 // Data direction: high <== low
2853 // Input:
2854 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2855 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2856 // Expected result:
2857 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2858 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2859 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2860 int vector_length_in_bytes) {
2861 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2862 assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte tables, we need to
  // compute the indices as bytes for all element types.
2865 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2866 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2867 dup(tmp1, size, zr);
2868 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
2869 negr(dst, size, mask);
2870 // Calculate vector index for TBL with prefix sum algorithm.
2871 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
2872 for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2873 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2874 addv(dst, size, tmp2, dst);
2875 }
2876 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2877 orr(tmp2, size, mask, mask);
2878 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2879 bsl(tmp2, size, dst, tmp1);
2880 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2881 movi(tmp1, size, 1);
2882 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
2883 subv(dst, size, tmp2, tmp1);
2884 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2885 tbl(dst, size, src, 1, dst);
2886 }
2887
2888 // Vector expand implementation for SVE.
2889 //
2890 // An example of 128-bit Short vector:
2891 // Data direction: high <== low
2892 // Input:
2893 // src = gf ed cb a9 87 65 43 21
2894 // pg = 00 01 00 01 00 01 00 01
2895 // Expected result:
2896 // dst = 00 87 00 65 00 43 00 21
2897 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2898 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2899 int vector_length_in_bytes) {
2900 assert(UseSVE > 0, "expand implementation only for SVE");
2901 assert_different_registers(dst, src, tmp1, tmp2);
2902 SIMD_RegVariant size = elemType_to_regVariant(bt);
2903
2904 // tmp1 = 00 00 00 00 00 00 00 00
2905 sve_dup(tmp1, size, 0);
2906 sve_movprfx(tmp2, tmp1);
2907 // tmp2 = 00 01 00 01 00 01 00 01
2908 sve_cpy(tmp2, size, pg, 1, true);
2909 // Calculate vector index for TBL with prefix sum algorithm.
2910 // tmp2 = 04 04 03 03 02 02 01 01
2911 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2912 sve_movprfx(dst, tmp1);
2913 // The EXT instruction operates on the full-width sve register. The correct
2914 // index calculation method is:
2915 // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2916 // MaxVectorSize - i.
2917 sve_ext(dst, tmp2, MaxVectorSize - i);
2918 sve_add(tmp2, size, dst, tmp2);
2919 }
2920 // dst = 00 04 00 03 00 02 00 01
2921 sve_sel(dst, size, pg, tmp2, tmp1);
2922 // dst = -1 03 -1 02 -1 01 -1 00
2923 sve_sub(dst, size, 1);
2924 // dst = 00 87 00 65 00 43 00 21
2925 sve_tbl(dst, size, src, dst);
2926 }
2927
2928 // Optimized SVE cpy (imm, zeroing) instruction.
2929 //
// `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
// functionality, but test results show that `movi; cpy(imm, merging)` has
// higher throughput on some microarchitectures; the benefit depends on the
// microarchitecture and may vary between implementations.
2934 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2935 PRegister pg, int imm8, bool isMerge) {
2936 if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2937 // Generates a NEON instruction `movi V<dst>.2d, #0`.
2938 // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2939 // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2940 // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2941 // entire Z<dst> register. According to the Arm Software Optimization
2942 // Guide, `movi` is zero latency.
2943 movi(dst, T2D, 0);
2944 isMerge = true;
2945 }
2946 Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2947 }
2948
2949 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
2950 // The vector iota entries array is ordered by type B/S/I/L/F/D, and
2951 // the offset between two types is 16.
2952 switch(bt) {
2953 case T_BYTE:
2954 return 0;
2955 case T_SHORT:
2956 return 1;
2957 case T_INT:
2958 return 2;
2959 case T_LONG:
2960 return 3;
2961 case T_FLOAT:
2962 return 4;
2963 case T_DOUBLE:
2964 return 5;
2965 default:
2966 ShouldNotReachHere();
2967 }
2968 }