1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright 2026 Arm Limited and/or its affiliates.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "opto/c2_MacroAssembler.hpp"
29 #include "opto/compile.hpp"
30 #include "opto/intrinsicnode.hpp"
31 #include "opto/matcher.hpp"
32 #include "opto/output.hpp"
33 #include "opto/subnode.hpp"
34 #include "runtime/objectMonitorTable.hpp"
35 #include "runtime/stubRoutines.hpp"
36 #include "runtime/synchronizer.hpp"
37 #include "utilities/globalDefinitions.hpp"
38 #include "utilities/powerOfTwo.hpp"
39
// Helpers for annotating the generated code.
//
// BLOCK_COMMENT(str) forwards to block_comment() in non-PRODUCT builds and expands to nothing in
// PRODUCT builds. STOP(error) calls stop(), preceded by a block comment in non-PRODUCT builds.
// BIND(label) binds the label and emits a block comment carrying the label's name.
//
// The multi-statement macros are wrapped in do { } while (0) so that each expands to exactly one
// statement; otherwise only the first call would be guarded in "if (cond) STOP(...);".
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) do { block_comment(error); stop(error); } while (0)
#endif

#define BIND(label) do { bind(label); BLOCK_COMMENT(#label ":"); } while (0)
49
50 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
51
52 // jdk.internal.util.ArraysSupport.vectorizedHashCode
53 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
54 FloatRegister vdata0, FloatRegister vdata1,
55 FloatRegister vdata2, FloatRegister vdata3,
56 FloatRegister vmul0, FloatRegister vmul1,
57 FloatRegister vmul2, FloatRegister vmul3,
58 FloatRegister vpow, FloatRegister vpowm,
59 BasicType eltype) {
60 ARRAYS_HASHCODE_REGISTERS;
61
62 Register tmp1 = rscratch1, tmp2 = rscratch2;
63
64 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
65
66 // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
67 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
68 // use 4H for chars and shorts instead, but using 8H gives better performance.
69 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
70 : eltype == T_CHAR || eltype == T_SHORT ? 8
71 : eltype == T_INT ? 4
72 : 0;
73 guarantee(vf, "unsupported eltype");
74
75 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
76 const size_t unroll_factor = 4;
77
78 switch (eltype) {
79 case T_BOOLEAN:
80 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
81 break;
82 case T_CHAR:
83 BLOCK_COMMENT("arrays_hashcode(char) {");
84 break;
85 case T_BYTE:
86 BLOCK_COMMENT("arrays_hashcode(byte) {");
87 break;
88 case T_SHORT:
89 BLOCK_COMMENT("arrays_hashcode(short) {");
90 break;
91 case T_INT:
92 BLOCK_COMMENT("arrays_hashcode(int) {");
93 break;
94 default:
95 ShouldNotReachHere();
96 }
97
98 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
99 // implemented by the stub executes just once. Call the stub only if at least two iterations will
100 // be executed.
101 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
102 cmpw(cnt, large_threshold);
103 br(Assembler::HS, LARGE);
104
105 bind(TAIL);
106
107 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
108 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
109 // Iteration eats up the remainder, uf elements at a time.
110 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
111 andr(tmp2, cnt, unroll_factor - 1);
112 adr(tmp1, BR_BASE);
113 // For Cortex-A53 offset is 4 because 2 nops are generated.
114 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
115 movw(tmp2, 0x1f);
116 br(tmp1);
117
118 bind(LOOP);
119 for (size_t i = 0; i < unroll_factor; ++i) {
120 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
121 maddw(result, result, tmp2, tmp1);
122 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
123 // Generate 2nd nop to have 4 instructions per iteration.
124 if (VM_Version::supports_a53mac()) {
125 nop();
126 }
127 }
128 bind(BR_BASE);
129 subsw(cnt, cnt, unroll_factor);
130 br(Assembler::HS, LOOP);
131
132 b(DONE);
133
134 bind(LARGE);
135
136 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
137 assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
138 address tpc = trampoline_call(stub);
139 if (tpc == nullptr) {
140 DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
141 postcond(pc() == badAddress);
142 return nullptr;
143 }
144
145 bind(DONE);
146
147 BLOCK_COMMENT("} // arrays_hashcode");
148
149 postcond(pc() != badAddress);
150 return pc();
151 }
152
153 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
154 Register t2, Register t3) {
155 assert_different_registers(obj, box, t1, t2, t3, rscratch2);
156
157 // Handle inflated monitor.
158 Label inflated;
159 // Finish fast lock successfully. MUST branch to with flag == EQ
160 Label locked;
161 // Finish fast lock unsuccessfully. MUST branch to with flag == NE
162 Label slow_path;
163
164 if (UseObjectMonitorTable) {
165 // Clear cache in case fast locking succeeds or we need to take the slow-path.
166 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
167 }
168
169 if (DiagnoseSyncOnValueBasedClasses != 0) {
170 load_klass(t1, obj);
171 ldrb(t1, Address(t1, Klass::misc_flags_offset()));
172 tst(t1, KlassFlags::_misc_is_value_based_class);
173 br(Assembler::NE, slow_path);
174 }
175
176 const Register t1_mark = t1;
177 const Register t3_t = t3;
178
179 { // Fast locking
180
181 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
182 Label push;
183
184 const Register t2_top = t2;
185
186 // Check if lock-stack is full.
187 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
188 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
189 br(Assembler::GT, slow_path);
190
191 // Check if recursive.
192 subw(t3_t, t2_top, oopSize);
193 ldr(t3_t, Address(rthread, t3_t));
194 cmp(obj, t3_t);
195 br(Assembler::EQ, push);
196
197 // Relaxed normal load to check for monitor. Optimization for monitor case.
198 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
199 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
200
201 // Not inflated
202 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
203
204 // Try to lock. Transition lock-bits 0b01 => 0b00
205 orr(t1_mark, t1_mark, markWord::unlocked_value);
206 eor(t3_t, t1_mark, markWord::unlocked_value);
207 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, memory_order_acquire);
208 br(Assembler::NE, slow_path);
209
210 bind(push);
211 // After successful lock, push object on lock-stack.
212 str(obj, Address(rthread, t2_top));
213 addw(t2_top, t2_top, oopSize);
214 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
215 b(locked);
216 }
217
218 { // Handle inflated monitor.
219 bind(inflated);
220
221 const Register t1_monitor = t1;
222
223 if (!UseObjectMonitorTable) {
224 assert(t1_monitor == t1_mark, "should be the same here");
225 } else {
226 const Register t1_hash = t1;
227 Label monitor_found;
228
229 // Save the mark, we might need it to extract the hash.
230 mov(t3, t1_mark);
231
232 // Look for the monitor in the om_cache.
233
234 ByteSize cache_offset = JavaThread::om_cache_oops_offset();
235 ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
236 const int num_unrolled = OMCache::CAPACITY;
237 for (int i = 0; i < num_unrolled; i++) {
238 ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
239 ldr(t2, Address(rthread, cache_offset));
240 cmp(obj, t2);
241 br(Assembler::EQ, monitor_found);
242 cache_offset = cache_offset + OMCache::oop_to_oop_difference();
243 }
244
245 // Look for the monitor in the table.
246
247 // Get the hash code.
248 ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);
249
250 // Get the table and calculate the bucket's address
251 lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
252 ldr(t3, Address(t3));
253 ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
254 ands(t1_hash, t1_hash, t2);
255 ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));
256
257 // Read the monitor from the bucket.
258 ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));
259
260 // Check if the monitor in the bucket is special (empty, tombstone or removed).
261 cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
262 br(Assembler::LO, slow_path);
263
264 // Check if object matches.
265 ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
266 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
267 bs_asm->try_peek_weak_handle_in_nmethod(this, t3, t3, t2, slow_path);
268 cmp(t3, obj);
269 br(Assembler::NE, slow_path);
270
271 bind(monitor_found);
272 }
273
274 const Register t2_owner_addr = t2;
275 const Register t3_owner = t3;
276 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
277 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
278 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
279
280 Label monitor_locked;
281
282 // Compute owner address.
283 lea(t2_owner_addr, owner_address);
284
285 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
286 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
287 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, memory_order_acquire, t3_owner);
288 br(Assembler::EQ, monitor_locked);
289
290 // Check if recursive.
291 cmp(t3_owner, rscratch2);
292 br(Assembler::NE, slow_path);
293
294 // Recursive.
295 increment(recursions_address, 1);
296
297 bind(monitor_locked);
298 if (UseObjectMonitorTable) {
299 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
300 }
301 }
302
303 bind(locked);
304
305 #ifdef ASSERT
306 // Check that locked label is reached with Flags == EQ.
307 Label flag_correct;
308 br(Assembler::EQ, flag_correct);
309 stop("Fast Lock Flag != EQ");
310 #endif
311
312 bind(slow_path);
313 #ifdef ASSERT
314 // Check that slow_path label is reached with Flags == NE.
315 br(Assembler::NE, flag_correct);
316 stop("Fast Lock Flag != NE");
317 bind(flag_correct);
318 #endif
319 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
320 }
321
322 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
323 Register t2, Register t3) {
324 assert_different_registers(obj, box, t1, t2, t3);
325
326 // Handle inflated monitor.
327 Label inflated, inflated_load_mark;
328 // Finish fast unlock successfully. MUST branch to with flag == EQ
329 Label unlocked;
330 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
331 Label slow_path;
332
333 const Register t1_mark = t1;
334 const Register t2_top = t2;
335 const Register t3_t = t3;
336
337 { // Fast unlock
338
339 Label push_and_slow_path;
340
341 // Check if obj is top of lock-stack.
342 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
343 subw(t2_top, t2_top, oopSize);
344 ldr(t3_t, Address(rthread, t2_top));
345 cmp(obj, t3_t);
346 // Top of lock stack was not obj. Must be monitor.
347 br(Assembler::NE, inflated_load_mark);
348
349 // Pop lock-stack.
350 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
351 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
352
353 // Check if recursive.
354 subw(t3_t, t2_top, oopSize);
355 ldr(t3_t, Address(rthread, t3_t));
356 cmp(obj, t3_t);
357 br(Assembler::EQ, unlocked);
358
359 // Not recursive.
360 // Load Mark.
361 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
362
363 // Check header for monitor (0b10).
364 // Because we got here by popping (meaning we pushed in locked)
365 // there will be no monitor in the box. So we need to push back the obj
366 // so that the runtime can fix any potential anonymous owner.
367 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
368
369 // Try to unlock. Transition lock bits 0b00 => 0b01
370 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
371 orr(t3_t, t1_mark, markWord::unlocked_value);
372 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, memory_order_release);
373 br(Assembler::EQ, unlocked);
374
375 bind(push_and_slow_path);
376 // Compare and exchange failed.
377 // Restore lock-stack and handle the unlock in runtime.
378 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
379 addw(t2_top, t2_top, oopSize);
380 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
381 b(slow_path);
382 }
383
384
385 { // Handle inflated monitor.
386 bind(inflated_load_mark);
387 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
388 #ifdef ASSERT
389 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
390 stop("Fast Unlock not monitor");
391 #endif
392
393 bind(inflated);
394
395 #ifdef ASSERT
396 Label check_done;
397 subw(t2_top, t2_top, oopSize);
398 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
399 br(Assembler::LT, check_done);
400 ldr(t3_t, Address(rthread, t2_top));
401 cmp(obj, t3_t);
402 br(Assembler::NE, inflated);
403 stop("Fast Unlock lock on stack");
404 bind(check_done);
405 #endif
406
407 const Register t1_monitor = t1;
408
409 if (!UseObjectMonitorTable) {
410 assert(t1_monitor == t1_mark, "should be the same here");
411
412 // Untag the monitor.
413 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
414 } else {
415 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
416 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
417 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
418 br(Assembler::LO, slow_path);
419 }
420
421 const Register t2_recursions = t2;
422 Label not_recursive;
423
424 // Check if recursive.
425 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
426 cbz(t2_recursions, not_recursive);
427
428 // Recursive unlock.
429 sub(t2_recursions, t2_recursions, 1u);
430 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
431 // Set flag == EQ
432 cmp(t2_recursions, t2_recursions);
433 b(unlocked);
434
435 bind(not_recursive);
436
437 const Register t2_owner_addr = t2;
438
439 // Compute owner address.
440 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
441
442 // Set owner to null.
443 // Release to satisfy the JMM
444 stlr(zr, t2_owner_addr);
445 // We need a full fence after clearing owner to avoid stranding.
446 // StoreLoad achieves this.
447 membar(StoreLoad);
448
449 // Check if the entry_list is empty.
450 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
451 cmp(rscratch1, zr);
452 br(Assembler::EQ, unlocked); // If so we are done.
453
454 // Check if there is a successor.
455 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
456 cmp(rscratch1, zr);
457 br(Assembler::NE, unlocked); // If so we are done.
458
459 // Save the monitor pointer in the current thread, so we can try to
460 // reacquire the lock in SharedRuntime::monitor_exit_helper().
461 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
462
463 cmp(zr, rthread); // Set Flag to NE => slow path
464 b(slow_path);
465 }
466
467 bind(unlocked);
468 cmp(zr, zr); // Set Flags to EQ => fast path
469
470 #ifdef ASSERT
471 // Check that unlocked label is reached with Flags == EQ.
472 Label flag_correct;
473 br(Assembler::EQ, flag_correct);
474 stop("Fast Unlock Flag != EQ");
475 #endif
476
477 bind(slow_path);
478 #ifdef ASSERT
479 // Check that slow_path label is reached with Flags == NE.
480 br(Assembler::NE, flag_correct);
481 stop("Fast Unlock Flag != NE");
482 bind(flag_correct);
483 #endif
484 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
485 }
486
487 // Search for str1 in str2 and return index or -1
488 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
489 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
490 Register cnt2, Register cnt1,
491 Register tmp1, Register tmp2,
492 Register tmp3, Register tmp4,
493 Register tmp5, Register tmp6,
494 int icnt1, Register result, int ae) {
495 // NOTE: tmp5, tmp6 can be zr depending on specific method version
496 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
497
498 Register ch1 = rscratch1;
499 Register ch2 = rscratch2;
500 Register cnt1tmp = tmp1;
501 Register cnt2tmp = tmp2;
502 Register cnt1_neg = cnt1;
503 Register cnt2_neg = cnt2;
504 Register result_tmp = tmp4;
505
506 bool isL = ae == StrIntrinsicNode::LL;
507
508 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
509 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
510 int str1_chr_shift = str1_isL ? 0:1;
511 int str2_chr_shift = str2_isL ? 0:1;
512 int str1_chr_size = str1_isL ? 1:2;
513 int str2_chr_size = str2_isL ? 1:2;
514 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
515 (chr_insn)&MacroAssembler::ldrh;
516 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
517 (chr_insn)&MacroAssembler::ldrh;
518 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
519 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
520
521 // Note, inline_string_indexOf() generates checks:
522 // if (substr.count > string.count) return -1;
523 // if (substr.count == 0) return 0;
524
525 // We have two strings, a source string in str2, cnt2 and a pattern string
526 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
527
528 // For larger pattern and source we use a simplified Boyer Moore algorithm.
529 // With a small pattern and source we use linear scan.
530
531 if (icnt1 == -1) {
532 sub(result_tmp, cnt2, cnt1);
533 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
534 br(LT, LINEARSEARCH);
535 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
536 subs(zr, cnt1, 256);
537 lsr(tmp1, cnt2, 2);
538 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
539 br(GE, LINEARSTUB);
540 }
541
542 // The Boyer Moore alogorithm is based on the description here:-
543 //
544 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
545 //
546 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
547 // and the 'Good Suffix' rule.
548 //
549 // These rules are essentially heuristics for how far we can shift the
550 // pattern along the search string.
551 //
552 // The implementation here uses the 'Bad Character' rule only because of the
553 // complexity of initialisation for the 'Good Suffix' rule.
554 //
555 // This is also known as the Boyer-Moore-Horspool algorithm:-
556 //
557 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
558 //
559 // This particular implementation has few java-specific optimizations.
560 //
561 // #define ASIZE 256
562 //
563 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
564 // int i, j;
565 // unsigned c;
566 // unsigned char bc[ASIZE];
567 //
568 // /* Preprocessing */
569 // for (i = 0; i < ASIZE; ++i)
570 // bc[i] = m;
571 // for (i = 0; i < m - 1; ) {
572 // c = x[i];
573 // ++i;
574 // // c < 256 for Latin1 string, so, no need for branch
575 // #ifdef PATTERN_STRING_IS_LATIN1
576 // bc[c] = m - i;
577 // #else
578 // if (c < ASIZE) bc[c] = m - i;
579 // #endif
580 // }
581 //
582 // /* Searching */
583 // j = 0;
584 // while (j <= n - m) {
585 // c = y[i+j];
586 // if (x[m-1] == c)
587 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
588 // if (i < 0) return j;
589 // // c < 256 for Latin1 string, so, no need for branch
590 // #ifdef SOURCE_STRING_IS_LATIN1
591 // // LL case: (c< 256) always true. Remove branch
592 // j += bc[y[j+m-1]];
593 // #endif
594 // #ifndef PATTERN_STRING_IS_UTF
595 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
596 // if (c < ASIZE)
597 // j += bc[y[j+m-1]];
598 // else
599 // j += 1
600 // #endif
601 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
602 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
603 // if (c < ASIZE)
604 // j += bc[y[j+m-1]];
605 // else
606 // j += m
607 // #endif
608 // }
609 // }
610
611 if (icnt1 == -1) {
612 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
613 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
614 Register cnt1end = tmp2;
615 Register str2end = cnt2;
616 Register skipch = tmp2;
617
618 // str1 length is >=8, so, we can read at least 1 register for cases when
619 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
620 // UL case. We'll re-read last character in inner pre-loop code to have
621 // single outer pre-loop load
622 const int firstStep = isL ? 7 : 3;
623
624 const int ASIZE = 256;
625 const int STORED_BYTES = 32; // amount of bytes stored per instruction
626 sub(sp, sp, ASIZE);
627 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
628 mov(ch1, sp);
629 BIND(BM_INIT_LOOP);
630 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
631 subs(tmp5, tmp5, 1);
632 br(GT, BM_INIT_LOOP);
633
634 sub(cnt1tmp, cnt1, 1);
635 mov(tmp5, str2);
636 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
637 sub(ch2, cnt1, 1);
638 mov(tmp3, str1);
639 BIND(BCLOOP);
640 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
641 if (!str1_isL) {
642 subs(zr, ch1, ASIZE);
643 br(HS, BCSKIP);
644 }
645 strb(ch2, Address(sp, ch1));
646 BIND(BCSKIP);
647 subs(ch2, ch2, 1);
648 br(GT, BCLOOP);
649
650 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
651 if (str1_isL == str2_isL) {
652 // load last 8 bytes (8LL/4UU symbols)
653 ldr(tmp6, Address(tmp6, -wordSize));
654 } else {
655 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
656 // convert Latin1 to UTF. We'll have to wait until load completed, but
657 // it's still faster than per-character loads+checks
658 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
659 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
660 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
661 andr(tmp6, tmp6, 0xFF); // str1[N-4]
662 orr(ch2, ch1, ch2, LSL, 16);
663 orr(tmp6, tmp6, tmp3, LSL, 48);
664 orr(tmp6, tmp6, ch2, LSL, 16);
665 }
666 BIND(BMLOOPSTR2);
667 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
668 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
669 if (str1_isL == str2_isL) {
670 // re-init tmp3. It's for free because it's executed in parallel with
671 // load above. Alternative is to initialize it before loop, but it'll
672 // affect performance on in-order systems with 2 or more ld/st pipelines
673 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
674 }
675 if (!isL) { // UU/UL case
676 lsl(ch2, cnt1tmp, 1); // offset in bytes
677 }
678 cmp(tmp3, skipch);
679 br(NE, BMSKIP);
680 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
681 mov(ch1, tmp6);
682 if (isL) {
683 b(BMLOOPSTR1_AFTER_LOAD);
684 } else {
685 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
686 b(BMLOOPSTR1_CMP);
687 }
688 BIND(BMLOOPSTR1);
689 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
690 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
691 BIND(BMLOOPSTR1_AFTER_LOAD);
692 subs(cnt1tmp, cnt1tmp, 1);
693 br(LT, BMLOOPSTR1_LASTCMP);
694 BIND(BMLOOPSTR1_CMP);
695 cmp(ch1, ch2);
696 br(EQ, BMLOOPSTR1);
697 BIND(BMSKIP);
698 if (!isL) {
699 // if we've met UTF symbol while searching Latin1 pattern, then we can
700 // skip cnt1 symbols
701 if (str1_isL != str2_isL) {
702 mov(result_tmp, cnt1);
703 } else {
704 mov(result_tmp, 1);
705 }
706 subs(zr, skipch, ASIZE);
707 br(HS, BMADV);
708 }
709 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
710 BIND(BMADV);
711 sub(cnt1tmp, cnt1, 1);
712 add(str2, str2, result_tmp, LSL, str2_chr_shift);
713 cmp(str2, str2end);
714 br(LE, BMLOOPSTR2);
715 add(sp, sp, ASIZE);
716 b(NOMATCH);
717 BIND(BMLOOPSTR1_LASTCMP);
718 cmp(ch1, ch2);
719 br(NE, BMSKIP);
720 BIND(BMMATCH);
721 sub(result, str2, tmp5);
722 if (!str2_isL) lsr(result, result, 1);
723 add(sp, sp, ASIZE);
724 b(DONE);
725
726 BIND(LINEARSTUB);
727 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
728 br(LT, LINEAR_MEDIUM);
729 mov(result, zr);
730 RuntimeAddress stub = nullptr;
731 if (isL) {
732 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
733 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
734 } else if (str1_isL) {
735 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
736 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
737 } else {
738 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
739 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
740 }
741 address call = trampoline_call(stub);
742 if (call == nullptr) {
743 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
744 ciEnv::current()->record_failure("CodeCache is full");
745 return;
746 }
747 b(DONE);
748 }
749
750 BIND(LINEARSEARCH);
751 {
752 Label DO1, DO2, DO3;
753
754 Register str2tmp = tmp2;
755 Register first = tmp3;
756
757 if (icnt1 == -1)
758 {
759 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
760
761 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
762 br(LT, DOSHORT);
763 BIND(LINEAR_MEDIUM);
764 (this->*str1_load_1chr)(first, Address(str1));
765 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
766 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
767 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
768 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
769
770 BIND(FIRST_LOOP);
771 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
772 cmp(first, ch2);
773 br(EQ, STR1_LOOP);
774 BIND(STR2_NEXT);
775 adds(cnt2_neg, cnt2_neg, str2_chr_size);
776 br(LE, FIRST_LOOP);
777 b(NOMATCH);
778
779 BIND(STR1_LOOP);
780 adds(cnt1tmp, cnt1_neg, str1_chr_size);
781 add(cnt2tmp, cnt2_neg, str2_chr_size);
782 br(GE, MATCH);
783
784 BIND(STR1_NEXT);
785 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
786 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
787 cmp(ch1, ch2);
788 br(NE, STR2_NEXT);
789 adds(cnt1tmp, cnt1tmp, str1_chr_size);
790 add(cnt2tmp, cnt2tmp, str2_chr_size);
791 br(LT, STR1_NEXT);
792 b(MATCH);
793
794 BIND(DOSHORT);
795 if (str1_isL == str2_isL) {
796 cmp(cnt1, (u1)2);
797 br(LT, DO1);
798 br(GT, DO3);
799 }
800 }
801
802 if (icnt1 == 4) {
803 Label CH1_LOOP;
804
805 (this->*load_4chr)(ch1, str1);
806 sub(result_tmp, cnt2, 4);
807 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
808 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
809
810 BIND(CH1_LOOP);
811 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
812 cmp(ch1, ch2);
813 br(EQ, MATCH);
814 adds(cnt2_neg, cnt2_neg, str2_chr_size);
815 br(LE, CH1_LOOP);
816 b(NOMATCH);
817 }
818
819 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
820 Label CH1_LOOP;
821
822 BIND(DO2);
823 (this->*load_2chr)(ch1, str1);
824 if (icnt1 == 2) {
825 sub(result_tmp, cnt2, 2);
826 }
827 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
828 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
829 BIND(CH1_LOOP);
830 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
831 cmp(ch1, ch2);
832 br(EQ, MATCH);
833 adds(cnt2_neg, cnt2_neg, str2_chr_size);
834 br(LE, CH1_LOOP);
835 b(NOMATCH);
836 }
837
838 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
839 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
840
841 BIND(DO3);
842 (this->*load_2chr)(first, str1);
843 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
844 if (icnt1 == 3) {
845 sub(result_tmp, cnt2, 3);
846 }
847 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
848 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
849 BIND(FIRST_LOOP);
850 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
851 cmpw(first, ch2);
852 br(EQ, STR1_LOOP);
853 BIND(STR2_NEXT);
854 adds(cnt2_neg, cnt2_neg, str2_chr_size);
855 br(LE, FIRST_LOOP);
856 b(NOMATCH);
857
858 BIND(STR1_LOOP);
859 add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
860 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
861 cmp(ch1, ch2);
862 br(NE, STR2_NEXT);
863 b(MATCH);
864 }
865
866 if (icnt1 == -1 || icnt1 == 1) {
867 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
868
869 BIND(DO1);
870 (this->*str1_load_1chr)(ch1, str1);
871 cmp(cnt2, (u1)8);
872 br(LT, DO1_SHORT);
873
874 sub(result_tmp, cnt2, 8/str2_chr_size);
875 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
876 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
877 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
878
879 if (str2_isL) {
880 orr(ch1, ch1, ch1, LSL, 8);
881 }
882 orr(ch1, ch1, ch1, LSL, 16);
883 orr(ch1, ch1, ch1, LSL, 32);
884 BIND(CH1_LOOP);
885 ldr(ch2, Address(str2, cnt2_neg));
886 eor(ch2, ch1, ch2);
887 sub(tmp1, ch2, tmp3);
888 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
889 bics(tmp1, tmp1, tmp2);
890 br(NE, HAS_ZERO);
891 adds(cnt2_neg, cnt2_neg, 8);
892 br(LT, CH1_LOOP);
893
894 cmp(cnt2_neg, (u1)8);
895 mov(cnt2_neg, 0);
896 br(LT, CH1_LOOP);
897 b(NOMATCH);
898
899 BIND(HAS_ZERO);
900 rev(tmp1, tmp1);
901 clz(tmp1, tmp1);
902 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
903 b(MATCH);
904
905 BIND(DO1_SHORT);
906 mov(result_tmp, cnt2);
907 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
908 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
909 BIND(DO1_LOOP);
910 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
911 cmpw(ch1, ch2);
912 br(EQ, MATCH);
913 adds(cnt2_neg, cnt2_neg, str2_chr_size);
914 br(LT, DO1_LOOP);
915 }
916 }
917 BIND(NOMATCH);
918 mov(result, -1);
919 b(DONE);
920 BIND(MATCH);
921 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
922 BIND(DONE);
923 }
924
925 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
926 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
927
928 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
929 Register ch, Register result,
930 Register tmp1, Register tmp2, Register tmp3)
931 {
932 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
933 Register cnt1_neg = cnt1;
934 Register ch1 = rscratch1;
935 Register result_tmp = rscratch2;
936
937 cbz(cnt1, NOMATCH);
938
939 cmp(cnt1, (u1)4);
940 br(LT, DO1_SHORT);
941
942 orr(ch, ch, ch, LSL, 16);
943 orr(ch, ch, ch, LSL, 32);
944
945 sub(cnt1, cnt1, 4);
946 mov(result_tmp, cnt1);
947 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
948 sub(cnt1_neg, zr, cnt1, LSL, 1);
949
950 mov(tmp3, 0x0001000100010001);
951
952 BIND(CH1_LOOP);
953 ldr(ch1, Address(str1, cnt1_neg));
954 eor(ch1, ch, ch1);
955 sub(tmp1, ch1, tmp3);
956 orr(tmp2, ch1, 0x7fff7fff7fff7fff);
957 bics(tmp1, tmp1, tmp2);
958 br(NE, HAS_ZERO);
959 adds(cnt1_neg, cnt1_neg, 8);
960 br(LT, CH1_LOOP);
961
962 cmp(cnt1_neg, (u1)8);
963 mov(cnt1_neg, 0);
964 br(LT, CH1_LOOP);
965 b(NOMATCH);
966
967 BIND(HAS_ZERO);
968 rev(tmp1, tmp1);
969 clz(tmp1, tmp1);
970 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
971 b(MATCH);
972
973 BIND(DO1_SHORT);
974 mov(result_tmp, cnt1);
975 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
976 sub(cnt1_neg, zr, cnt1, LSL, 1);
977 BIND(DO1_LOOP);
978 ldrh(ch1, Address(str1, cnt1_neg));
979 cmpw(ch, ch1);
980 br(EQ, MATCH);
981 adds(cnt1_neg, cnt1_neg, 2);
982 br(LT, DO1_LOOP);
983 BIND(NOMATCH);
984 mov(result, -1);
985 b(DONE);
986 BIND(MATCH);
987 add(result, result_tmp, cnt1_neg, ASR, 1);
988 BIND(DONE);
989 }
990
991 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
992 Register ch, Register result,
993 FloatRegister ztmp1,
994 FloatRegister ztmp2,
995 PRegister tmp_pg,
996 PRegister tmp_pdn, bool isL)
997 {
998 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
999 assert(tmp_pg->is_governing(),
1000 "this register has to be a governing predicate register");
1001
1002 Label LOOP, MATCH, DONE, NOMATCH;
1003 Register vec_len = rscratch1;
1004 Register idx = rscratch2;
1005
1006 SIMD_RegVariant T = (isL == true) ? B : H;
1007
1008 cbz(cnt1, NOMATCH);
1009
1010 // Assign the particular char throughout the vector.
1011 sve_dup(ztmp2, T, ch);
1012 if (isL) {
1013 sve_cntb(vec_len);
1014 } else {
1015 sve_cnth(vec_len);
1016 }
1017 mov(idx, 0);
1018
1019 // Generate a predicate to control the reading of input string.
1020 sve_whilelt(tmp_pg, T, idx, cnt1);
1021
1022 BIND(LOOP);
1023 // Read a vector of 8- or 16-bit data depending on the string type. Note
1024 // that inactive elements indicated by the predicate register won't cause
1025 // a data read from memory to the destination vector.
1026 if (isL) {
1027 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1028 } else {
1029 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1030 }
1031 add(idx, idx, vec_len);
1032
1033 // Perform the comparison. An element of the destination predicate is set
1034 // to active if the particular char is matched.
1035 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1036
1037 // Branch if the particular char is found.
1038 br(NE, MATCH);
1039
1040 sve_whilelt(tmp_pg, T, idx, cnt1);
1041
1042 // Loop back if the particular char not found.
1043 br(MI, LOOP);
1044
1045 BIND(NOMATCH);
1046 mov(result, -1);
1047 b(DONE);
1048
1049 BIND(MATCH);
1050 // Undo the index increment.
1051 sub(idx, idx, vec_len);
1052
1053 // Crop the vector to find its location.
1054 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1055 add(result, idx, -1);
1056 sve_incp(result, T, tmp_pdn);
1057 BIND(DONE);
1058 }
1059
1060 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1061 Register ch, Register result,
1062 Register tmp1, Register tmp2, Register tmp3)
1063 {
1064 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1065 Register cnt1_neg = cnt1;
1066 Register ch1 = rscratch1;
1067 Register result_tmp = rscratch2;
1068
1069 cbz(cnt1, NOMATCH);
1070
1071 cmp(cnt1, (u1)8);
1072 br(LT, DO1_SHORT);
1073
1074 orr(ch, ch, ch, LSL, 8);
1075 orr(ch, ch, ch, LSL, 16);
1076 orr(ch, ch, ch, LSL, 32);
1077
1078 sub(cnt1, cnt1, 8);
1079 mov(result_tmp, cnt1);
1080 lea(str1, Address(str1, cnt1));
1081 sub(cnt1_neg, zr, cnt1);
1082
1083 mov(tmp3, 0x0101010101010101);
1084
1085 BIND(CH1_LOOP);
1086 ldr(ch1, Address(str1, cnt1_neg));
1087 eor(ch1, ch, ch1);
1088 sub(tmp1, ch1, tmp3);
1089 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1090 bics(tmp1, tmp1, tmp2);
1091 br(NE, HAS_ZERO);
1092 adds(cnt1_neg, cnt1_neg, 8);
1093 br(LT, CH1_LOOP);
1094
1095 cmp(cnt1_neg, (u1)8);
1096 mov(cnt1_neg, 0);
1097 br(LT, CH1_LOOP);
1098 b(NOMATCH);
1099
1100 BIND(HAS_ZERO);
1101 rev(tmp1, tmp1);
1102 clz(tmp1, tmp1);
1103 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1104 b(MATCH);
1105
1106 BIND(DO1_SHORT);
1107 mov(result_tmp, cnt1);
1108 lea(str1, Address(str1, cnt1));
1109 sub(cnt1_neg, zr, cnt1);
1110 BIND(DO1_LOOP);
1111 ldrb(ch1, Address(str1, cnt1_neg));
1112 cmp(ch, ch1);
1113 br(EQ, MATCH);
1114 adds(cnt1_neg, cnt1_neg, 1);
1115 br(LT, DO1_LOOP);
1116 BIND(NOMATCH);
1117 mov(result, -1);
1118 b(DONE);
1119 BIND(MATCH);
1120 add(result, result_tmp, cnt1_neg);
1121 BIND(DONE);
1122 }
1123
// Compare strings.
//
// Emits code that compares the string at str1 with the string at str2. `ae`
// (a StrIntrinsicNode encoding: LL, UU, LU or UL) says whether each string is
// Latin1 (one byte per character) or UTF-16 (two bytes per character).
//
// cnt1 and cnt2 are the lengths in bytes on entry. `result` receives the
// difference between the first pair of differing characters or, when the
// common prefix is equal, the difference of the lengths in characters.
//
// Strategy, as emitted below:
//  - minimum length <= one word's worth of characters: character-by-character
//    loop (SHORT_STRING);
//  - minimum length >= stub_threshold: call the compare_long_string_* stub;
//  - otherwise: compare eight bytes of UTF-16 (or Latin1, for LL) data per
//    iteration, widening the Latin1 side on the fly in the LU/UL cases.
//
// vtmp3, pgtmp1 and pgtmp2 are not referenced by the code emitted here;
// presumably they are reserved for the stub call -- TODO confirm.
// Clobbers: str1, str2, cnt1, cnt2, tmp1, tmp2, vtmp1, vtmp2, rscratch1,
// rscratch2, rflags (the stub may clobber more; not visible here).
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // Per-string character size, as a shift amount and as a byte count.
  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Number of characters compared per 8-byte word in the longword loop.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  // vtmpZ is zeroed and used as the second zip1 operand; vtmp holds the
  // Latin1 data being widened.
  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  // `result` keeps this difference unless a differing character is found.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Same address: nothing to compare, `result` already holds the
      // length difference.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      // Exactly one word's worth of characters remains and it is already
      // loaded into tmp1/tmp2.
      br(EQ, TAIL_CHECK);
      // Point both strings at their last word and turn cnt2 into a negative
      // byte offset from there.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // str1 is Latin1: load four bytes and interleave them with zero bytes
      // (zip1 with the zeroed vtmpZ) to form four 16-bit characters in tmp1.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      // cnt1 becomes the negative byte offset into str1, cnt2 the one into
      // str2; the two differ because the character sizes differ.
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror image of the LU case: str2 is the Latin1 side and is widened
      // into tmp2.
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    // Advance cnt2 past the word that is already loaded; cnt2 indexes the
    // Latin1 string in the UL case, hence the step of 4 bytes there.
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    // The flags come from the adds(cnt2, ...) in each branch above.
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    // Check the word loaded by the last iteration before loading the final
    // one.
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    // rscratch2 holds tmp1 ^ tmp2 on entry to DIFF.
    bind(DIFF);
    // rev + clz yields the bit position of the lowest set bit at byte
    // granularity; the mask rounds it down to a character boundary.
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    // Shift the differing character to the bottom of each word and
    // zero-extend it before subtracting.
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  // Long strings: hand over to the out-of-line stub for this encoding.
  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      // The trampoline call could not be emitted. Reset the labels that are
      // referenced but not yet bound, record the failure and stop generating
      // code.
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  // The loop alternates between two register pairs: (tmp1, cnt1) and
  // (tmp2, rscratch1) hold a str1/str2 character pair each. cnt1 is free to
  // reuse because the length difference is already in `result`.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  // Last pair is in (tmp2, rscratch1).
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  // Single-character case: the str2 character has not been loaded yet.
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  // Last pair is in (tmp1, cnt1).
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1359
1360 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1361 FloatRegister src2, Condition cond, bool isQ) {
1362 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1363 FloatRegister zn = src1, zm = src2;
1364 bool needs_negation = false;
1365 switch (cond) {
1366 case LT: cond = GT; zn = src2; zm = src1; break;
1367 case LE: cond = GE; zn = src2; zm = src1; break;
1368 case LO: cond = HI; zn = src2; zm = src1; break;
1369 case LS: cond = HS; zn = src2; zm = src1; break;
1370 case NE: cond = EQ; needs_negation = true; break;
1371 default:
1372 break;
1373 }
1374
1375 if (is_floating_point_type(bt)) {
1376 fcm(cond, dst, size, zn, zm);
1377 } else {
1378 cm(cond, dst, size, zn, zm);
1379 }
1380
1381 if (needs_negation) {
1382 notr(dst, isQ ? T16B : T8B, dst);
1383 }
1384 }
1385
1386 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1387 Condition cond, bool isQ) {
1388 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1389 if (bt == T_FLOAT || bt == T_DOUBLE) {
1390 if (cond == Assembler::NE) {
1391 fcm(Assembler::EQ, dst, size, src);
1392 notr(dst, isQ ? T16B : T8B, dst);
1393 } else {
1394 fcm(cond, dst, size, src);
1395 }
1396 } else {
1397 if (cond == Assembler::NE) {
1398 cm(Assembler::EQ, dst, size, src);
1399 notr(dst, isQ ? T16B : T8B, dst);
1400 } else {
1401 cm(cond, dst, size, src);
1402 }
1403 }
1404 }
1405
1406 // Compress the least significant bit of each byte to the rightmost and clear
1407 // the higher garbage bits.
1408 void C2_MacroAssembler::bytemask_compress(Register dst) {
1409 // Example input, dst = 0x01 00 00 00 01 01 00 01
1410 // The "??" bytes are garbage.
1411 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1412 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1413 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1414 andr(dst, dst, 0xff); // dst = 0x8D
1415 }
1416
1417 // Pack the value of each mask element in "src" into a long value in "dst", at most
1418 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1419 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1420 // one bit in "dst".
1421 //
1422 // Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1423 // Expected: dst = 0x658D
1424 //
1425 // Clobbers: rscratch1
1426 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1427 FloatRegister vtmp, int lane_cnt) {
1428 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1429 assert_different_registers(dst, rscratch1);
1430 assert_different_registers(src, vtmp);
1431 assert(UseSVE > 0, "must be");
1432
1433 // Compress the lowest 8 bytes.
1434 fmovd(dst, src);
1435 bytemask_compress(dst);
1436 if (lane_cnt <= 8) return;
1437
1438 // Repeat on higher bytes and join the results.
1439 // Compress 8 bytes in each iteration.
1440 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1441 sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1442 bytemask_compress(rscratch1);
1443 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1444 }
1445 }
1446
1447 // The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
1448 // instruction which requires the FEAT_BITPERM feature.
1449 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1450 FloatRegister vtmp1, FloatRegister vtmp2,
1451 int lane_cnt) {
1452 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1453 assert_different_registers(src, vtmp1, vtmp2);
1454 assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1455
1456 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1457 // is to compress each significant bit of the byte in a cross-lane way. Due
1458 // to the lack of a cross-lane bit-compress instruction, we use BEXT
1459 // (bit-compress in each lane) with the biggest lane size (T = D) then
1460 // concatenate the results.
1461
1462 // The second source input of BEXT, initialized with 0x01 in each byte.
1463 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1464 sve_dup(vtmp2, B, 1);
1465
1466 // BEXT vtmp1.D, src.D, vtmp2.D
1467 // src = 0x0001010000010001 | 0x0100000001010001
1468 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1469 // ---------------------------------------
1470 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1471 sve_bext(vtmp1, D, src, vtmp2);
1472
1473 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1474 // result to dst.
1475 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1476 // dst = 0x658D
1477 if (lane_cnt <= 8) {
1478 // No need to concatenate.
1479 umov(dst, vtmp1, B, 0);
1480 } else if (lane_cnt <= 16) {
1481 ins(vtmp1, B, vtmp1, 1, 8);
1482 umov(dst, vtmp1, H, 0);
1483 } else {
1484 // As the lane count is 64 at most, the final expected value must be in
1485 // the lowest 64 bits after narrowing vtmp1 from D to B.
1486 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1487 umov(dst, vtmp1, D, 0);
1488 }
1489 }
1490
1491 // Unpack the mask, a long value in "src", into a vector register of boolean
1492 // represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
1493 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
1494 // most 64 lanes.
1495 //
1496 // Below example gives the expected dst vector register, with a valid src(0x658D)
1497 // on a 128-bit vector size machine.
1498 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1499 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1500 FloatRegister vtmp, int lane_cnt) {
1501 assert_different_registers(dst, vtmp);
1502 assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1503 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1504
1505 // Example: src = 0x658D, lane_cnt = 16
1506 // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1507
1508 // Put long value from general purpose register into the first lane of vector.
1509 // vtmp = 0x0000000000000000 | 0x000000000000658D
1510 sve_dup(vtmp, B, 0);
1511 mov(vtmp, D, 0, src);
1512
1513 // Transform the value in the first lane which is mask in bit now to the mask in
1514 // byte, which can be done by SVE2's BDEP instruction.
1515
1516 // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
1517 // vtmp = 0x0000000000000065 | 0x000000000000008D
1518 if (lane_cnt <= 8) {
1519 // Nothing. As only one byte exsits.
1520 } else if (lane_cnt <= 16) {
1521 ins(vtmp, B, vtmp, 8, 1);
1522 } else {
1523 sve_vector_extend(vtmp, D, vtmp, B);
1524 }
1525
1526 // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1527 // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1528 sve_dup(dst, B, 1);
1529
1530 // BDEP dst.D, vtmp.D, dst.D
1531 // vtmp = 0x0000000000000065 | 0x000000000000008D
1532 // dst = 0x0101010101010101 | 0x0101010101010101
1533 // ---------------------------------------
1534 // dst = 0x0001010000010001 | 0x0100000001010001
1535 sve_bdep(dst, D, vtmp, dst);
1536 }
1537
1538 // Clobbers: rflags
1539 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1540 FloatRegister zn, FloatRegister zm, Condition cond) {
1541 assert(pg->is_governing(), "This register has to be a governing predicate register");
1542 FloatRegister z1 = zn, z2 = zm;
1543 switch (cond) {
1544 case LE: z1 = zm; z2 = zn; cond = GE; break;
1545 case LT: z1 = zm; z2 = zn; cond = GT; break;
1546 case LO: z1 = zm; z2 = zn; cond = HI; break;
1547 case LS: z1 = zm; z2 = zn; cond = HS; break;
1548 default:
1549 break;
1550 }
1551
1552 SIMD_RegVariant size = elemType_to_regVariant(bt);
1553 if (is_floating_point_type(bt)) {
1554 sve_fcm(cond, pd, size, pg, z1, z2);
1555 } else {
1556 assert(is_integral_type(bt), "unsupported element type");
1557 sve_cmp(cond, pd, size, pg, z1, z2);
1558 }
1559 }
1560
1561 // Get index of the last mask lane that is set
1562 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1563 SIMD_RegVariant size = elemType_to_regVariant(bt);
1564 sve_rev(ptmp, size, src);
1565 sve_brkb(ptmp, ptrue, ptmp, false);
1566 sve_cntp(dst, size, ptrue, ptmp);
1567 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1568 subw(dst, rscratch1, dst);
1569 }
1570
1571 // Extend integer vector src to dst with the same lane count
1572 // but larger element size, e.g. 4B -> 4I
1573 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1574 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1575 if (src_bt == T_BYTE) {
1576 // 4B to 4S/4I, 8B to 8S
1577 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1578 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1579 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1580 if (dst_bt == T_INT) {
1581 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1582 }
1583 } else if (src_bt == T_SHORT) {
1584 // 2S to 2I/2L, 4S to 4I
1585 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1586 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1587 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1588 if (dst_bt == T_LONG) {
1589 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1590 }
1591 } else if (src_bt == T_INT) {
1592 // 2I to 2L
1593 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1594 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1595 } else {
1596 ShouldNotReachHere();
1597 }
1598 }
1599
1600 // Narrow integer vector src down to dst with the same lane count
1601 // but smaller element size, e.g. 4I -> 4B
1602 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1603 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1604 if (src_bt == T_SHORT) {
1605 // 4S/8S to 4B/8B
1606 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1607 assert(dst_bt == T_BYTE, "unsupported");
1608 xtn(dst, T8B, src, T8H);
1609 } else if (src_bt == T_INT) {
1610 // 2I to 2S, 4I to 4B/4S
1611 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1612 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1613 xtn(dst, T4H, src, T4S);
1614 if (dst_bt == T_BYTE) {
1615 xtn(dst, T8B, dst, T8H);
1616 }
1617 } else if (src_bt == T_LONG) {
1618 // 2L to 2S/2I
1619 assert(src_vlen_in_bytes == 16, "unsupported");
1620 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1621 xtn(dst, T2S, src, T2D);
1622 if (dst_bt == T_SHORT) {
1623 xtn(dst, T4H, dst, T4S);
1624 }
1625 } else {
1626 ShouldNotReachHere();
1627 }
1628 }
1629
1630 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1631 FloatRegister src, SIMD_RegVariant src_size,
1632 bool is_unsigned) {
1633 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1634
1635 if (src_size == B) {
1636 switch (dst_size) {
1637 case H:
1638 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1639 break;
1640 case S:
1641 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1642 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1643 break;
1644 case D:
1645 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1646 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1647 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1648 break;
1649 default:
1650 ShouldNotReachHere();
1651 }
1652 } else if (src_size == H) {
1653 if (dst_size == S) {
1654 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1655 } else { // D
1656 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1657 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1658 }
1659 } else if (src_size == S) {
1660 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1661 }
1662 }
1663
1664 // Vector narrow from src to dst with specified element sizes.
1665 // High part of dst vector will be filled with zero.
1666 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1667 FloatRegister src, SIMD_RegVariant src_size,
1668 FloatRegister tmp) {
1669 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1670 assert_different_registers(src, tmp);
1671 sve_dup(tmp, src_size, 0);
1672 if (src_size == D) {
1673 switch (dst_size) {
1674 case S:
1675 sve_uzp1(dst, S, src, tmp);
1676 break;
1677 case H:
1678 assert_different_registers(dst, tmp);
1679 sve_uzp1(dst, S, src, tmp);
1680 sve_uzp1(dst, H, dst, tmp);
1681 break;
1682 case B:
1683 assert_different_registers(dst, tmp);
1684 sve_uzp1(dst, S, src, tmp);
1685 sve_uzp1(dst, H, dst, tmp);
1686 sve_uzp1(dst, B, dst, tmp);
1687 break;
1688 default:
1689 ShouldNotReachHere();
1690 }
1691 } else if (src_size == S) {
1692 if (dst_size == H) {
1693 sve_uzp1(dst, H, src, tmp);
1694 } else { // B
1695 assert_different_registers(dst, tmp);
1696 sve_uzp1(dst, H, src, tmp);
1697 sve_uzp1(dst, B, dst, tmp);
1698 }
1699 } else if (src_size == H) {
1700 sve_uzp1(dst, B, src, tmp);
1701 }
1702 }
1703
1704 // Extend src predicate to dst predicate with the same lane count but larger
1705 // element size, e.g. 64Byte -> 512Long
1706 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1707 uint dst_element_length_in_bytes,
1708 uint src_element_length_in_bytes) {
1709 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1710 sve_punpklo(dst, src);
1711 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1712 sve_punpklo(dst, src);
1713 sve_punpklo(dst, dst);
1714 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1715 sve_punpklo(dst, src);
1716 sve_punpklo(dst, dst);
1717 sve_punpklo(dst, dst);
1718 } else {
1719 assert(false, "unsupported");
1720 ShouldNotReachHere();
1721 }
1722 }
1723
1724 // Narrow src predicate to dst predicate with the same lane count but
1725 // smaller element size, e.g. 512Long -> 64Byte
1726 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1727 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1728 // The insignificant bits in src predicate are expected to be zero.
1729 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1730 // passed as the second argument. An example narrowing operation with a given mask would be -
1731 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1732 // Mask (for 2 Longs) : TF
1733 // Predicate register for the above mask (16 bits) : 00000001 00000000
1734 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1735 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1736 assert_different_registers(src, ptmp);
1737 assert_different_registers(dst, ptmp);
1738 sve_pfalse(ptmp);
1739 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1740 sve_uzp1(dst, B, src, ptmp);
1741 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1742 sve_uzp1(dst, H, src, ptmp);
1743 sve_uzp1(dst, B, dst, ptmp);
1744 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1745 sve_uzp1(dst, S, src, ptmp);
1746 sve_uzp1(dst, H, dst, ptmp);
1747 sve_uzp1(dst, B, dst, ptmp);
1748 } else {
1749 assert(false, "unsupported");
1750 ShouldNotReachHere();
1751 }
1752 }
1753
1754 // Vector reduction add for integral type with ASIMD instructions.
1755 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1756 Register isrc, FloatRegister vsrc,
1757 unsigned vector_length_in_bytes,
1758 FloatRegister vtmp) {
1759 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1760 assert_different_registers(dst, isrc);
1761 bool isQ = vector_length_in_bytes == 16;
1762
1763 BLOCK_COMMENT("neon_reduce_add_integral {");
1764 switch(bt) {
1765 case T_BYTE:
1766 addv(vtmp, isQ ? T16B : T8B, vsrc);
1767 smov(dst, vtmp, B, 0);
1768 addw(dst, dst, isrc, ext::sxtb);
1769 break;
1770 case T_SHORT:
1771 addv(vtmp, isQ ? T8H : T4H, vsrc);
1772 smov(dst, vtmp, H, 0);
1773 addw(dst, dst, isrc, ext::sxth);
1774 break;
1775 case T_INT:
1776 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1777 umov(dst, vtmp, S, 0);
1778 addw(dst, dst, isrc);
1779 break;
1780 case T_LONG:
1781 assert(isQ, "unsupported");
1782 addpd(vtmp, vsrc);
1783 umov(dst, vtmp, D, 0);
1784 add(dst, dst, isrc);
1785 break;
1786 default:
1787 assert(false, "unsupported");
1788 ShouldNotReachHere();
1789 }
1790 BLOCK_COMMENT("} neon_reduce_add_integral");
1791 }
1792
1793 // Vector reduction multiply for integral type with ASIMD instructions.
1794 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1795 // Clobbers: rscratch1
1796 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1797 Register isrc, FloatRegister vsrc,
1798 unsigned vector_length_in_bytes,
1799 FloatRegister vtmp1, FloatRegister vtmp2) {
1800 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1801 bool isQ = vector_length_in_bytes == 16;
1802
1803 BLOCK_COMMENT("neon_reduce_mul_integral {");
1804 switch(bt) {
1805 case T_BYTE:
1806 if (isQ) {
1807 // Multiply the lower half and higher half of vector iteratively.
1808 // vtmp1 = vsrc[8:15]
1809 ins(vtmp1, D, vsrc, 0, 1);
1810 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1811 mulv(vtmp1, T8B, vtmp1, vsrc);
1812 // vtmp2 = vtmp1[4:7]
1813 ins(vtmp2, S, vtmp1, 0, 1);
1814 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1815 mulv(vtmp1, T8B, vtmp2, vtmp1);
1816 } else {
1817 ins(vtmp1, S, vsrc, 0, 1);
1818 mulv(vtmp1, T8B, vtmp1, vsrc);
1819 }
1820 // vtmp2 = vtmp1[2:3]
1821 ins(vtmp2, H, vtmp1, 0, 1);
1822 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1823 mulv(vtmp2, T8B, vtmp2, vtmp1);
1824 // dst = vtmp2[0] * isrc * vtmp2[1]
1825 umov(rscratch1, vtmp2, B, 0);
1826 mulw(dst, rscratch1, isrc);
1827 sxtb(dst, dst);
1828 umov(rscratch1, vtmp2, B, 1);
1829 mulw(dst, rscratch1, dst);
1830 sxtb(dst, dst);
1831 break;
1832 case T_SHORT:
1833 if (isQ) {
1834 ins(vtmp2, D, vsrc, 0, 1);
1835 mulv(vtmp2, T4H, vtmp2, vsrc);
1836 ins(vtmp1, S, vtmp2, 0, 1);
1837 mulv(vtmp1, T4H, vtmp1, vtmp2);
1838 } else {
1839 ins(vtmp1, S, vsrc, 0, 1);
1840 mulv(vtmp1, T4H, vtmp1, vsrc);
1841 }
1842 umov(rscratch1, vtmp1, H, 0);
1843 mulw(dst, rscratch1, isrc);
1844 sxth(dst, dst);
1845 umov(rscratch1, vtmp1, H, 1);
1846 mulw(dst, rscratch1, dst);
1847 sxth(dst, dst);
1848 break;
1849 case T_INT:
1850 if (isQ) {
1851 ins(vtmp1, D, vsrc, 0, 1);
1852 mulv(vtmp1, T2S, vtmp1, vsrc);
1853 } else {
1854 vtmp1 = vsrc;
1855 }
1856 umov(rscratch1, vtmp1, S, 0);
1857 mul(dst, rscratch1, isrc);
1858 umov(rscratch1, vtmp1, S, 1);
1859 mul(dst, rscratch1, dst);
1860 break;
1861 case T_LONG:
1862 umov(rscratch1, vsrc, D, 0);
1863 mul(dst, isrc, rscratch1);
1864 umov(rscratch1, vsrc, D, 1);
1865 mul(dst, dst, rscratch1);
1866 break;
1867 default:
1868 assert(false, "unsupported");
1869 ShouldNotReachHere();
1870 }
1871 BLOCK_COMMENT("} neon_reduce_mul_integral");
1872 }
1873
1874 // Vector reduction multiply for floating-point type with ASIMD instructions.
1875 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1876 FloatRegister fsrc, FloatRegister vsrc,
1877 unsigned vector_length_in_bytes,
1878 FloatRegister vtmp) {
1879 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1880 bool isQ = vector_length_in_bytes == 16;
1881
1882 BLOCK_COMMENT("neon_reduce_mul_fp {");
1883 switch(bt) {
1884 // The T_SHORT type below is for Float16 type which also uses floating-point
1885 // instructions.
1886 case T_SHORT:
1887 fmulh(dst, fsrc, vsrc);
1888 ext(vtmp, T8B, vsrc, vsrc, 2);
1889 fmulh(dst, dst, vtmp);
1890 ext(vtmp, T8B, vsrc, vsrc, 4);
1891 fmulh(dst, dst, vtmp);
1892 ext(vtmp, T8B, vsrc, vsrc, 6);
1893 fmulh(dst, dst, vtmp);
1894 if (isQ) {
1895 ext(vtmp, T16B, vsrc, vsrc, 8);
1896 fmulh(dst, dst, vtmp);
1897 ext(vtmp, T16B, vsrc, vsrc, 10);
1898 fmulh(dst, dst, vtmp);
1899 ext(vtmp, T16B, vsrc, vsrc, 12);
1900 fmulh(dst, dst, vtmp);
1901 ext(vtmp, T16B, vsrc, vsrc, 14);
1902 fmulh(dst, dst, vtmp);
1903 }
1904 break;
1905 case T_FLOAT:
1906 fmuls(dst, fsrc, vsrc);
1907 ins(vtmp, S, vsrc, 0, 1);
1908 fmuls(dst, dst, vtmp);
1909 if (isQ) {
1910 ins(vtmp, S, vsrc, 0, 2);
1911 fmuls(dst, dst, vtmp);
1912 ins(vtmp, S, vsrc, 0, 3);
1913 fmuls(dst, dst, vtmp);
1914 }
1915 break;
1916 case T_DOUBLE:
1917 assert(isQ, "unsupported");
1918 fmuld(dst, fsrc, vsrc);
1919 ins(vtmp, D, vsrc, 0, 1);
1920 fmuld(dst, dst, vtmp);
1921 break;
1922 default:
1923 assert(false, "unsupported");
1924 ShouldNotReachHere();
1925 }
1926 BLOCK_COMMENT("} neon_reduce_mul_fp");
1927 }
1928
1929 // Vector reduction add for half float type with ASIMD instructions.
1930 void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
1931 unsigned vector_length_in_bytes, FloatRegister vtmp) {
1932 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1933 bool isQ = vector_length_in_bytes == 16;
1934
1935 BLOCK_COMMENT("neon_reduce_add_fp16 {");
1936 faddh(dst, fsrc, vsrc);
1937 ext(vtmp, T8B, vsrc, vsrc, 2);
1938 faddh(dst, dst, vtmp);
1939 ext(vtmp, T8B, vsrc, vsrc, 4);
1940 faddh(dst, dst, vtmp);
1941 ext(vtmp, T8B, vsrc, vsrc, 6);
1942 faddh(dst, dst, vtmp);
1943 if (isQ) {
1944 ext(vtmp, T16B, vsrc, vsrc, 8);
1945 faddh(dst, dst, vtmp);
1946 ext(vtmp, T16B, vsrc, vsrc, 10);
1947 faddh(dst, dst, vtmp);
1948 ext(vtmp, T16B, vsrc, vsrc, 12);
1949 faddh(dst, dst, vtmp);
1950 ext(vtmp, T16B, vsrc, vsrc, 14);
1951 faddh(dst, dst, vtmp);
1952 }
1953 BLOCK_COMMENT("} neon_reduce_add_fp16");
1954 }
1955
1956 // Helper to select logical instruction
1957 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1958 Register Rn, Register Rm,
1959 enum shift_kind kind, unsigned shift) {
1960 switch(opc) {
1961 case Op_AndReductionV:
1962 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1963 break;
1964 case Op_OrReductionV:
1965 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1966 break;
1967 case Op_XorReductionV:
1968 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1969 break;
1970 default:
1971 assert(false, "unsupported");
1972 ShouldNotReachHere();
1973 }
1974 }
1975
1976 // Vector reduction logical operations And, Or, Xor
1977 // Clobbers: rscratch1
1978 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1979 Register isrc, FloatRegister vsrc,
1980 unsigned vector_length_in_bytes) {
1981 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1982 "unsupported");
1983 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1984 assert_different_registers(dst, isrc);
1985 bool isQ = vector_length_in_bytes == 16;
1986
1987 BLOCK_COMMENT("neon_reduce_logical {");
1988 umov(rscratch1, vsrc, isQ ? D : S, 0);
1989 umov(dst, vsrc, isQ ? D : S, 1);
1990 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1991 switch(bt) {
1992 case T_BYTE:
1993 if (isQ) {
1994 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1995 }
1996 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1997 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1998 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1999 sxtb(dst, dst);
2000 break;
2001 case T_SHORT:
2002 if (isQ) {
2003 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2004 }
2005 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2006 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2007 sxth(dst, dst);
2008 break;
2009 case T_INT:
2010 if (isQ) {
2011 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2012 }
2013 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2014 break;
2015 case T_LONG:
2016 assert(isQ, "unsupported");
2017 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2018 break;
2019 default:
2020 assert(false, "unsupported");
2021 ShouldNotReachHere();
2022 }
2023 BLOCK_COMMENT("} neon_reduce_logical");
2024 }
2025
2026 // Helper function to decode min/max reduction operation properties
2027 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2028 bool* is_unsigned,
2029 Condition* cond) {
2030 switch(opc) {
2031 case Op_MinReductionV:
2032 *is_min = true; *is_unsigned = false; *cond = LT; break;
2033 case Op_MaxReductionV:
2034 *is_min = false; *is_unsigned = false; *cond = GT; break;
2035 case Op_UMinReductionV:
2036 *is_min = true; *is_unsigned = true; *cond = LO; break;
2037 case Op_UMaxReductionV:
2038 *is_min = false; *is_unsigned = true; *cond = HI; break;
2039 default:
2040 ShouldNotReachHere();
2041 }
2042 }
2043
2044 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
2045 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2046 // Clobbers: rscratch1, rflags
2047 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2048 Register isrc, FloatRegister vsrc,
2049 unsigned vector_length_in_bytes,
2050 FloatRegister vtmp) {
2051 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
2052 opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
2053 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2054 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2055 assert_different_registers(dst, isrc);
2056 bool isQ = vector_length_in_bytes == 16;
2057 bool is_min;
2058 bool is_unsigned;
2059 Condition cond;
2060 decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2061 BLOCK_COMMENT("neon_reduce_minmax_integral {");
2062 if (bt == T_LONG) {
2063 assert(vtmp == fnoreg, "should be");
2064 assert(isQ, "should be");
2065 umov(rscratch1, vsrc, D, 0);
2066 cmp(isrc, rscratch1);
2067 csel(dst, isrc, rscratch1, cond);
2068 umov(rscratch1, vsrc, D, 1);
2069 cmp(dst, rscratch1);
2070 csel(dst, dst, rscratch1, cond);
2071 } else {
2072 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2073 if (size == T2S) {
2074 // For T2S (2x32-bit elements), use pairwise instructions because
2075 // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
2076 neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
2077 } else {
2078 // For other sizes, use reduction to scalar instructions.
2079 neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
2080 }
2081 if (bt == T_INT) {
2082 umov(dst, vtmp, S, 0);
2083 } else if (is_unsigned) {
2084 umov(dst, vtmp, elemType_to_regVariant(bt), 0);
2085 } else {
2086 smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2087 }
2088 cmpw(dst, isrc);
2089 cselw(dst, dst, isrc, cond);
2090 }
2091 BLOCK_COMMENT("} neon_reduce_minmax_integral");
2092 }
2093
2094 // Vector reduction for integral type with SVE instruction.
2095 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
2096 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2097 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2098 FloatRegister src2, PRegister pg, FloatRegister tmp) {
2099 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2100 assert(pg->is_governing(), "This register has to be a governing predicate register");
2101 assert_different_registers(src1, dst);
2102 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2103 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2104 switch (opc) {
2105 case Op_AddReductionVI: {
2106 sve_uaddv(tmp, size, pg, src2);
2107 if (bt == T_BYTE) {
2108 smov(dst, tmp, size, 0);
2109 addw(dst, src1, dst, ext::sxtb);
2110 } else if (bt == T_SHORT) {
2111 smov(dst, tmp, size, 0);
2112 addw(dst, src1, dst, ext::sxth);
2113 } else {
2114 umov(dst, tmp, size, 0);
2115 addw(dst, dst, src1);
2116 }
2117 break;
2118 }
2119 case Op_AddReductionVL: {
2120 sve_uaddv(tmp, size, pg, src2);
2121 umov(dst, tmp, size, 0);
2122 add(dst, dst, src1);
2123 break;
2124 }
2125 case Op_AndReductionV: {
2126 sve_andv(tmp, size, pg, src2);
2127 if (bt == T_INT || bt == T_LONG) {
2128 umov(dst, tmp, size, 0);
2129 } else {
2130 smov(dst, tmp, size, 0);
2131 }
2132 if (bt == T_LONG) {
2133 andr(dst, dst, src1);
2134 } else {
2135 andw(dst, dst, src1);
2136 }
2137 break;
2138 }
2139 case Op_OrReductionV: {
2140 sve_orv(tmp, size, pg, src2);
2141 if (bt == T_INT || bt == T_LONG) {
2142 umov(dst, tmp, size, 0);
2143 } else {
2144 smov(dst, tmp, size, 0);
2145 }
2146 if (bt == T_LONG) {
2147 orr(dst, dst, src1);
2148 } else {
2149 orrw(dst, dst, src1);
2150 }
2151 break;
2152 }
2153 case Op_XorReductionV: {
2154 sve_eorv(tmp, size, pg, src2);
2155 if (bt == T_INT || bt == T_LONG) {
2156 umov(dst, tmp, size, 0);
2157 } else {
2158 smov(dst, tmp, size, 0);
2159 }
2160 if (bt == T_LONG) {
2161 eor(dst, dst, src1);
2162 } else {
2163 eorw(dst, dst, src1);
2164 }
2165 break;
2166 }
2167 case Op_MaxReductionV:
2168 case Op_MinReductionV:
2169 case Op_UMaxReductionV:
2170 case Op_UMinReductionV: {
2171 bool is_min;
2172 bool is_unsigned;
2173 Condition cond;
2174 decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2175 sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
2176 // Move result from vector to general register
2177 if (is_unsigned || bt == T_INT || bt == T_LONG) {
2178 umov(dst, tmp, size, 0);
2179 } else {
2180 smov(dst, tmp, size, 0);
2181 }
2182 if (bt == T_LONG) {
2183 cmp(dst, src1);
2184 csel(dst, dst, src1, cond);
2185 } else {
2186 cmpw(dst, src1);
2187 cselw(dst, dst, src1, cond);
2188 }
2189 break;
2190 }
2191 default:
2192 assert(false, "unsupported");
2193 ShouldNotReachHere();
2194 }
2195
2196 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2197 if (bt == T_BYTE) {
2198 sxtb(dst, dst);
2199 } else if (bt == T_SHORT) {
2200 sxth(dst, dst);
2201 }
2202 }
2203 }
2204
2205 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2206 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2207 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2208 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2209 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2210 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2211
2212 // Set all elements to false if the input "lane_cnt" is zero.
2213 if (lane_cnt == 0) {
2214 sve_pfalse(dst);
2215 return;
2216 }
2217
2218 SIMD_RegVariant size = elemType_to_regVariant(bt);
2219 assert(size != Q, "invalid size");
2220
2221 // Set all true if "lane_cnt" equals to the max lane count.
2222 if (lane_cnt == max_vector_length) {
2223 sve_ptrue(dst, size, /* ALL */ 0b11111);
2224 return;
2225 }
2226
2227 // Fixed numbers for "ptrue".
2228 switch(lane_cnt) {
2229 case 1: /* VL1 */
2230 case 2: /* VL2 */
2231 case 3: /* VL3 */
2232 case 4: /* VL4 */
2233 case 5: /* VL5 */
2234 case 6: /* VL6 */
2235 case 7: /* VL7 */
2236 case 8: /* VL8 */
2237 sve_ptrue(dst, size, lane_cnt);
2238 return;
2239 case 16:
2240 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2241 return;
2242 case 32:
2243 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2244 return;
2245 case 64:
2246 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2247 return;
2248 case 128:
2249 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2250 return;
2251 case 256:
2252 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2253 return;
2254 default:
2255 break;
2256 }
2257
2258 // Special patterns for "ptrue".
2259 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2260 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2261 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2262 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2263 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2264 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2265 } else {
2266 // Encode to "whileltw" for the remaining cases.
2267 mov(rscratch1, lane_cnt);
2268 sve_whileltw(dst, size, zr, rscratch1);
2269 }
2270 }
2271
2272 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2273 // Any remaining elements of dst will be filled with zero.
2274 // Clobbers: rscratch1
2275 // Preserves: mask, vzr
2276 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2277 FloatRegister vzr, FloatRegister vtmp,
2278 PRegister pgtmp, unsigned vector_length_in_bytes) {
2279 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2280 // When called by sve_compress_byte, src and vtmp may be the same register.
2281 assert_different_registers(dst, src, vzr);
2282 assert_different_registers(dst, vtmp, vzr);
2283 assert_different_registers(mask, pgtmp);
2284 // high <-- low
2285 // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
2286 // mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
2287 // Expected result: dst = 00 00 00 hh ee dd bb aa
2288
2289 // Extend lowest half to type INT.
2290 // dst = 00dd 00cc 00bb 00aa
2291 sve_uunpklo(dst, S, src);
2292 // pgtmp = 0001 0000 0001 0001
2293 sve_punpklo(pgtmp, mask);
2294 // Pack the active elements in size of type INT to the right,
2295 // and fill the remainings with zero.
2296 // dst = 0000 00dd 00bb 00aa
2297 sve_compact(dst, S, dst, pgtmp);
2298 // Narrow the result back to type SHORT.
2299 // dst = 00 00 00 00 00 dd bb aa
2300 sve_uzp1(dst, H, dst, vzr);
2301
2302 // Return if the vector length is no more than MaxVectorSize/2, since the
2303 // highest half is invalid.
2304 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2305 return;
2306 }
2307
2308 // Count the active elements of lowest half.
2309 // rscratch1 = 3
2310 sve_cntp(rscratch1, S, ptrue, pgtmp);
2311
2312 // Repeat to the highest half.
2313 // pgtmp = 0001 0000 0000 0001
2314 sve_punpkhi(pgtmp, mask);
2315 // vtmp = 00hh 00gg 00ff 00ee
2316 sve_uunpkhi(vtmp, S, src);
2317 // vtmp = 0000 0000 00hh 00ee
2318 sve_compact(vtmp, S, vtmp, pgtmp);
2319 // vtmp = 00 00 00 00 00 00 hh ee
2320 sve_uzp1(vtmp, H, vtmp, vzr);
2321
2322 // pgtmp = 00 00 00 00 00 01 01 01
2323 sve_whilelt(pgtmp, H, zr, rscratch1);
2324 // Compressed low: dst = 00 00 00 00 00 dd bb aa
2325 // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2326 // Combine the compressed low with the compressed high:
2327 // dst = 00 00 00 hh ee dd bb aa
2328 sve_splice(dst, H, pgtmp, vtmp);
2329 }
2330
2331 // Clobbers: rscratch1, rscratch2
2332 // Preserves: src, mask
2333 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2334 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2335 PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2336 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2337 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2338 assert_different_registers(mask, ptmp, pgtmp);
2339 // high <-- low
2340 // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
2341 // mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2342 // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2343 FloatRegister vzr = vtmp3;
2344 sve_dup(vzr, B, 0);
2345
2346 // Extend lowest half to type SHORT.
2347 // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
2348 sve_uunpklo(vtmp1, H, src);
2349 // ptmp = 00 01 00 00 00 01 00 01
2350 sve_punpklo(ptmp, mask);
2351 // Pack the active elements in size of type SHORT to the right,
2352 // and fill the remainings with zero.
2353 // dst = 00 00 00 00 00 0g 0c 0a
2354 unsigned extended_size = vector_length_in_bytes << 1;
2355 sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2356 // Narrow the result back to type BYTE.
2357 // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2358 sve_uzp1(dst, B, dst, vzr);
2359
2360 // Return if the vector length is no more than MaxVectorSize/2, since the
2361 // highest half is invalid.
2362 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2363 return;
2364 }
2365 // Count the active elements of lowest half.
2366 // rscratch2 = 3
2367 sve_cntp(rscratch2, H, ptrue, ptmp);
2368
2369 // Repeat to the highest half.
2370 // ptmp = 00 01 00 00 00 00 00 01
2371 sve_punpkhi(ptmp, mask);
2372 // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
2373 sve_uunpkhi(vtmp2, H, src);
2374 // vtmp1 = 00 00 00 00 00 00 0p 0i
2375 sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2376 // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2377 sve_uzp1(vtmp1, B, vtmp1, vzr);
2378
2379 // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2380 sve_whilelt(ptmp, B, zr, rscratch2);
2381 // Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2382 // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2383 // Combine the compressed low with the compressed high:
2384 // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2385 sve_splice(dst, B, ptmp, vtmp1);
2386 }
2387
2388 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2389 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2390 SIMD_Arrangement size = isQ ? T16B : T8B;
2391 if (bt == T_BYTE) {
2392 rbit(dst, size, src);
2393 } else {
2394 neon_reverse_bytes(dst, src, bt, isQ);
2395 rbit(dst, size, dst);
2396 }
2397 }
2398
2399 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2400 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2401 SIMD_Arrangement size = isQ ? T16B : T8B;
2402 switch (bt) {
2403 case T_BYTE:
2404 if (dst != src) {
2405 orr(dst, size, src, src);
2406 }
2407 break;
2408 case T_SHORT:
2409 rev16(dst, size, src);
2410 break;
2411 case T_INT:
2412 rev32(dst, size, src);
2413 break;
2414 case T_LONG:
2415 rev64(dst, size, src);
2416 break;
2417 default:
2418 assert(false, "unsupported");
2419 ShouldNotReachHere();
2420 }
2421 }
2422
2423 // VectorRearrange implementation for short/int/float/long/double types with NEON
2424 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2425 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2426 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2427 // and use bsl to implement the operation.
2428 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2429 FloatRegister shuffle, FloatRegister tmp,
2430 BasicType bt, bool isQ) {
2431 assert_different_registers(dst, src, shuffle, tmp);
2432 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2433 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2434
2435 // Here is an example that rearranges a NEON vector with 4 ints:
2436 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2437 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2438 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2439 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2440 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2441 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2442 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2443 // 4. Use Vm as index register, and use V1 as table register.
2444 // Then get V2 as the result by tbl NEON instructions.
2445 switch (bt) {
2446 case T_SHORT:
2447 mov(tmp, size1, 0x02);
2448 mulv(dst, size2, shuffle, tmp);
2449 mov(tmp, size2, 0x0100);
2450 addv(dst, size1, dst, tmp);
2451 tbl(dst, size1, src, 1, dst);
2452 break;
2453 case T_INT:
2454 case T_FLOAT:
2455 mov(tmp, size1, 0x04);
2456 mulv(dst, size2, shuffle, tmp);
2457 mov(tmp, size2, 0x03020100);
2458 addv(dst, size1, dst, tmp);
2459 tbl(dst, size1, src, 1, dst);
2460 break;
2461 case T_LONG:
2462 case T_DOUBLE:
2463 {
2464 int idx = vector_iota_entry_index(T_LONG);
2465 lea(rscratch1,
2466 ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
2467 ldrq(tmp, rscratch1);
2468 // Check whether the input "shuffle" is the same with iota indices.
2469 // Return "src" if true, otherwise swap the two elements of "src".
2470 cm(EQ, dst, size2, shuffle, tmp);
2471 ext(tmp, size1, src, src, 8);
2472 bsl(dst, size1, src, tmp);
2473 }
2474 break;
2475 default:
2476 assert(false, "unsupported element type");
2477 ShouldNotReachHere();
2478 }
2479 }
2480
2481 // Extract a scalar element from an sve vector at position 'idx'.
2482 // The input elements in src are expected to be of integral type.
2483 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2484 int idx, FloatRegister vtmp) {
2485 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2486 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2487 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2488 if (bt == T_INT || bt == T_LONG) {
2489 umov(dst, src, size, idx);
2490 } else {
2491 smov(dst, src, size, idx);
2492 }
2493 } else {
2494 sve_movprfx(vtmp, src);
2495 // Although vtmp and src hold the same value after movprfx, we must use src
2496 // (not vtmp) as the second source of ext. The movprfx destination register
2497 // must not appear in any source operand of the following instruction except
2498 // as the destructive operand.
2499 sve_ext(vtmp, src, idx << size);
2500 if (bt == T_INT || bt == T_LONG) {
2501 umov(dst, vtmp, size, 0);
2502 } else {
2503 smov(dst, vtmp, size, 0);
2504 }
2505 }
2506 }
2507
2508 // java.lang.Math::round intrinsics
2509
2510 // Clobbers: rscratch1, rflags
2511 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2512 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2513 assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2514 switch (T) {
2515 case T2S:
2516 case T4S:
2517 fmovs(tmp1, T, 0.5f);
2518 mov(rscratch1, jint_cast(0x1.0p23f));
2519 break;
2520 case T2D:
2521 fmovd(tmp1, T, 0.5);
2522 mov(rscratch1, julong_cast(0x1.0p52));
2523 break;
2524 default:
2525 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2526 }
2527 fadd(tmp1, T, tmp1, src);
2528 fcvtms(tmp1, T, tmp1);
2529 // tmp1 = floor(src + 0.5, ties to even)
2530
2531 fcvtas(dst, T, src);
2532 // dst = round(src), ties to away
2533
2534 fneg(tmp3, T, src);
2535 dup(tmp2, T, rscratch1);
2536 cm(HS, tmp3, T, tmp3, tmp2);
2537 // tmp3 is now a set of flags
2538
2539 bif(dst, T16B, tmp1, tmp3);
2540 // result in dst
2541 }
2542
2543 // Clobbers: rscratch1, rflags
2544 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2545 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2546 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2547 assert_different_registers(tmp1, tmp2, src, dst);
2548
2549 switch (T) {
2550 case S:
2551 mov(rscratch1, jint_cast(0x1.0p23f));
2552 break;
2553 case D:
2554 mov(rscratch1, julong_cast(0x1.0p52));
2555 break;
2556 default:
2557 assert(T == S || T == D, "invalid register variant");
2558 }
2559
2560 sve_frinta(dst, T, ptrue, src);
2561 // dst = round(src), ties to away
2562
2563 Label none;
2564
2565 sve_fneg(tmp1, T, ptrue, src);
2566 sve_dup(tmp2, T, rscratch1);
2567 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2568 br(EQ, none);
2569 {
2570 sve_cpy(tmp1, T, pgtmp, 0.5);
2571 sve_fadd(tmp1, T, pgtmp, src);
2572 sve_frintm(dst, T, pgtmp, tmp1);
2573 // dst = floor(src + 0.5, ties to even)
2574 }
2575 bind(none);
2576
2577 sve_fcvtzs(dst, T, ptrue, dst, T);
2578 // result in dst
2579 }
2580
2581 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2582 FloatRegister one, SIMD_Arrangement T) {
2583 assert_different_registers(dst, src, zero, one);
2584 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2585
2586 facgt(dst, T, src, zero);
2587 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2588 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2589 }
2590
2591 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2592 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2593 assert_different_registers(dst, src, zero, one, vtmp);
2594 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2595
2596 sve_orr(vtmp, src, src);
2597 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2598 switch (T) {
2599 case S:
2600 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2601 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2602 // on the sign of the float value
2603 break;
2604 case D:
2605 sve_and(vtmp, T, min_jlong);
2606 sve_orr(vtmp, T, jlong_cast(1.0));
2607 break;
2608 default:
2609 assert(false, "unsupported");
2610 ShouldNotReachHere();
2611 }
2612 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2613 // Result in dst
2614 }
2615
2616 bool C2_MacroAssembler::in_scratch_emit_size() {
2617 if (ciEnv::current()->task() != nullptr) {
2618 PhaseOutput* phase_output = Compile::current()->output();
2619 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2620 return true;
2621 }
2622 }
2623 return MacroAssembler::in_scratch_emit_size();
2624 }
2625
2626 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2627 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2628 }
2629
2630 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2631 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2632 if (t == TypeInt::INT) {
2633 return;
2634 }
2635
2636 BLOCK_COMMENT("verify_int_in_range {");
2637 Label L_success, L_failure;
2638
2639 jint lo = t->_lo;
2640 jint hi = t->_hi;
2641
2642 if (lo != min_jint) {
2643 subsw(rtmp, rval, lo);
2644 br(Assembler::LT, L_failure);
2645 }
2646 if (hi != max_jint) {
2647 subsw(rtmp, rval, hi);
2648 br(Assembler::GT, L_failure);
2649 }
2650 b(L_success);
2651
2652 bind(L_failure);
2653 movw(c_rarg0, idx);
2654 mov(c_rarg1, rval);
2655 movw(c_rarg2, lo);
2656 movw(c_rarg3, hi);
2657 reconstruct_frame_pointer(rtmp);
2658 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2659 hlt(0);
2660
2661 bind(L_success);
2662 BLOCK_COMMENT("} verify_int_in_range");
2663 }
2664
2665 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2666 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2667 }
2668
2669 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2670 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2671 if (t == TypeLong::LONG) {
2672 return;
2673 }
2674
2675 BLOCK_COMMENT("verify_long_in_range {");
2676 Label L_success, L_failure;
2677
2678 jlong lo = t->_lo;
2679 jlong hi = t->_hi;
2680
2681 if (lo != min_jlong) {
2682 subs(rtmp, rval, lo);
2683 br(Assembler::LT, L_failure);
2684 }
2685 if (hi != max_jlong) {
2686 subs(rtmp, rval, hi);
2687 br(Assembler::GT, L_failure);
2688 }
2689 b(L_success);
2690
2691 bind(L_failure);
2692 movw(c_rarg0, idx);
2693 mov(c_rarg1, rval);
2694 mov(c_rarg2, lo);
2695 mov(c_rarg3, hi);
2696 reconstruct_frame_pointer(rtmp);
2697 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2698 hlt(0);
2699
2700 bind(L_success);
2701 BLOCK_COMMENT("} verify_long_in_range");
2702 }
2703
2704 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2705 const int framesize = Compile::current()->output()->frame_size_in_bytes();
2706 if (PreserveFramePointer) {
2707 // frame pointer is valid
2708 #ifdef ASSERT
2709 // Verify frame pointer value in rfp.
2710 add(rtmp, sp, framesize - 2 * wordSize);
2711 Label L_success;
2712 cmp(rfp, rtmp);
2713 br(Assembler::EQ, L_success);
2714 stop("frame pointer mismatch");
2715 bind(L_success);
2716 #endif // ASSERT
2717 } else {
2718 add(rfp, sp, framesize - 2 * wordSize);
2719 }
2720 }
2721
2722 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2723 // using Neon instructions and places it in the destination vector element corresponding to the
2724 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2725 // where NUM_ELEM is the number of BasicType elements per vector.
2726 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2727 // Otherwise, selects src2[idx – NUM_ELEM]
2728 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2729 FloatRegister src2, FloatRegister index,
2730 FloatRegister tmp, unsigned vector_length_in_bytes) {
2731 assert_different_registers(dst, src1, src2, tmp);
2732 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2733
2734 if (vector_length_in_bytes == 16) {
2735 assert(UseSVE <= 1, "sve must be <= 1");
2736 assert(src1->successor() == src2, "Source registers must be ordered");
2737 // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2738 tbl(dst, size, src1, 2, index);
2739 } else { // vector length == 8
2740 assert(UseSVE == 0, "must be Neon only");
2741 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2742 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2743 // instruction with one vector lookup
2744 ins(tmp, D, src1, 0, 0);
2745 ins(tmp, D, src2, 1, 0);
2746 tbl(dst, size, tmp, 1, index);
2747 }
2748 }
2749
2750 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2751 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2752 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2753 // where NUM_ELEM is the number of BasicType elements per vector.
2754 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2755 // Otherwise, selects src2[idx – NUM_ELEM]
2756 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2757 FloatRegister src2, FloatRegister index,
2758 FloatRegister tmp, SIMD_RegVariant T,
2759 unsigned vector_length_in_bytes) {
2760 assert_different_registers(dst, src1, src2, index, tmp);
2761
2762 if (vector_length_in_bytes == 8) {
2763 // We need to fit both the source vectors (src1, src2) in a single vector register because the
2764 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2765 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2766 // instruction with one vector lookup
2767 assert(UseSVE >= 1, "sve must be >= 1");
2768 ins(tmp, D, src1, 0, 0);
2769 ins(tmp, D, src2, 1, 0);
2770 sve_tbl(dst, T, tmp, index);
2771 } else { // UseSVE == 2 and vector_length_in_bytes > 8
2772 // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2773 // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2774 // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2775 // with the only exception of 8B vector length.
2776 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2777 assert(src1->successor() == src2, "Source registers must be ordered");
2778 sve_tbl(dst, T, src1, src2, index);
2779 }
2780 }
2781
2782 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2783 FloatRegister src2, FloatRegister index,
2784 FloatRegister tmp, BasicType bt,
2785 unsigned vector_length_in_bytes) {
2786
2787 assert_different_registers(dst, src1, src2, index, tmp);
2788
2789 // The cases that can reach this method are -
2790 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2791 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2792 //
2793 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2794 // and UseSVE = 2 with vector_length_in_bytes >= 8
2795 //
2796 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2797 // UseSVE = 1 with vector_length_in_bytes = 16
2798
2799 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2800 SIMD_RegVariant T = elemType_to_regVariant(bt);
2801 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2802 return;
2803 }
2804
2805 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2806 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2807 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2808
2809 bool isQ = vector_length_in_bytes == 16;
2810
2811 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2812 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2813
2814 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2815 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2816 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2817 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2818 // the indices can range from [0, 8).
2819 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2820 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2821 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2822 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2823 // Add the multiplied result to the vector in tmp to obtain the byte level
2824 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2825 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2826
2827 if (bt == T_BYTE) {
2828 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2829 } else {
2830 int elem_size = (bt == T_SHORT) ? 2 : 4;
2831 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2832
2833 mov(tmp, size1, elem_size);
2834 mulv(dst, size2, index, tmp);
2835 mov(tmp, size2, tbl_offset);
2836 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2837 // to select a set of 2B/4B
2838 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2839 }
2840 }
2841
// Vector expand implementation. Elements from the src vector are expanded into
// the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
// (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
// for NEON and SVE, but with different instructions where appropriate.
2849
2850 // Vector expand implementation for NEON.
2851 //
2852 // An example of 128-bit Byte vector:
2853 // Data direction: high <== low
2854 // Input:
2855 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2856 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2857 // Expected result:
2858 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2859 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2860 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2861 int vector_length_in_bytes) {
2862 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2863 assert_different_registers(dst, src, mask, tmp1, tmp2);
2864 // Since the TBL instruction only supports byte table, we need to
2865 // compute indices in byte type for all types.
2866 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2867 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2868 dup(tmp1, size, zr);
2869 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
2870 negr(dst, size, mask);
2871 // Calculate vector index for TBL with prefix sum algorithm.
2872 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
2873 for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2874 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2875 addv(dst, size, tmp2, dst);
2876 }
2877 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2878 orr(tmp2, size, mask, mask);
2879 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2880 bsl(tmp2, size, dst, tmp1);
2881 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2882 movi(tmp1, size, 1);
2883 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
2884 subv(dst, size, tmp2, tmp1);
2885 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2886 tbl(dst, size, src, 1, dst);
2887 }
2888
2889 // Vector expand implementation for SVE.
2890 //
2891 // An example of 128-bit Short vector:
2892 // Data direction: high <== low
2893 // Input:
2894 // src = gf ed cb a9 87 65 43 21
2895 // pg = 00 01 00 01 00 01 00 01
2896 // Expected result:
2897 // dst = 00 87 00 65 00 43 00 21
2898 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2899 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2900 int vector_length_in_bytes) {
2901 assert(UseSVE > 0, "expand implementation only for SVE");
2902 assert_different_registers(dst, src, tmp1, tmp2);
2903 SIMD_RegVariant size = elemType_to_regVariant(bt);
2904
2905 // tmp1 = 00 00 00 00 00 00 00 00
2906 sve_dup(tmp1, size, 0);
2907 sve_movprfx(tmp2, tmp1);
2908 // tmp2 = 00 01 00 01 00 01 00 01
2909 sve_cpy(tmp2, size, pg, 1, true);
2910 // Calculate vector index for TBL with prefix sum algorithm.
2911 // tmp2 = 04 04 03 03 02 02 01 01
2912 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2913 sve_movprfx(dst, tmp1);
2914 // The EXT instruction operates on the full-width sve register. The correct
2915 // index calculation method is:
2916 // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2917 // MaxVectorSize - i.
2918 sve_ext(dst, tmp2, MaxVectorSize - i);
2919 sve_add(tmp2, size, dst, tmp2);
2920 }
2921 // dst = 00 04 00 03 00 02 00 01
2922 sve_sel(dst, size, pg, tmp2, tmp1);
2923 // dst = -1 03 -1 02 -1 01 -1 00
2924 sve_sub(dst, size, 1);
2925 // dst = 00 87 00 65 00 43 00 21
2926 sve_tbl(dst, size, src, dst);
2927 }
2928
2929 // Optimized SVE cpy (imm, zeroing) instruction.
2930 //
2931 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2932 // functionality, but test results show that `movi; cpy(imm, merging)` has
2933 // higher throughput on some microarchitectures. This would depend on
2934 // microarchitecture and so may vary between implementations.
2935 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2936 PRegister pg, int imm8, bool isMerge) {
2937 if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2938 // Generates a NEON instruction `movi V<dst>.2d, #0`.
2939 // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2940 // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2941 // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2942 // entire Z<dst> register. According to the Arm Software Optimization
2943 // Guide, `movi` is zero latency.
2944 movi(dst, T2D, 0);
2945 isMerge = true;
2946 }
2947 Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2948 }
2949
2950 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
2951 // The vector iota entries array is ordered by type B/S/I/L/F/D, and
2952 // the offset between two types is 16.
2953 switch(bt) {
2954 case T_BYTE:
2955 return 0;
2956 case T_SHORT:
2957 return 1;
2958 case T_INT:
2959 return 2;
2960 case T_LONG:
2961 return 3;
2962 case T_FLOAT:
2963 return 4;
2964 case T_DOUBLE:
2965 return 5;
2966 default:
2967 ShouldNotReachHere();
2968 }
2969 }