1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright 2026 Arm Limited and/or its affiliates.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "opto/c2_MacroAssembler.hpp"
29 #include "opto/compile.hpp"
30 #include "opto/intrinsicnode.hpp"
31 #include "opto/matcher.hpp"
32 #include "opto/output.hpp"
33 #include "opto/subnode.hpp"
34 #include "runtime/objectMonitorTable.hpp"
35 #include "runtime/stubRoutines.hpp"
36 #include "runtime/synchronizer.hpp"
37 #include "utilities/globalDefinitions.hpp"
38 #include "utilities/powerOfTwo.hpp"
39
// Helpers for annotating the generated code.
//
// BLOCK_COMMENT(str) forwards to block_comment() in non-PRODUCT builds and
//                    expands to nothing in PRODUCT builds.
// STOP(error)        emits stop(error); non-PRODUCT builds first emit the same
//                    text as a block comment.
// BIND(label)        binds the label and annotates it with "<label>:".
//
// The macros that expand to more than one statement are wrapped in
// do { ... } while (0) so that each use is a single statement. Without the
// wrapper, "if (cond) STOP(msg);" would guard only the first statement.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) do { block_comment(error); stop(error); } while (0)
#endif

#define BIND(label) do { bind(label); BLOCK_COMMENT(#label ":"); } while (0)
49
50 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
51
52 // jdk.internal.util.ArraysSupport.vectorizedHashCode
53 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
54 FloatRegister vdata0, FloatRegister vdata1,
55 FloatRegister vdata2, FloatRegister vdata3,
56 FloatRegister vmul0, FloatRegister vmul1,
57 FloatRegister vmul2, FloatRegister vmul3,
58 FloatRegister vpow, FloatRegister vpowm,
59 BasicType eltype) {
60 ARRAYS_HASHCODE_REGISTERS;
61
62 Register tmp1 = rscratch1, tmp2 = rscratch2;
63
64 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
65
66 // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
67 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
68 // use 4H for chars and shorts instead, but using 8H gives better performance.
69 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
70 : eltype == T_CHAR || eltype == T_SHORT ? 8
71 : eltype == T_INT ? 4
72 : 0;
73 guarantee(vf, "unsupported eltype");
74
75 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
76 const size_t unroll_factor = 4;
77
78 switch (eltype) {
79 case T_BOOLEAN:
80 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
81 break;
82 case T_CHAR:
83 BLOCK_COMMENT("arrays_hashcode(char) {");
84 break;
85 case T_BYTE:
86 BLOCK_COMMENT("arrays_hashcode(byte) {");
87 break;
88 case T_SHORT:
89 BLOCK_COMMENT("arrays_hashcode(short) {");
90 break;
91 case T_INT:
92 BLOCK_COMMENT("arrays_hashcode(int) {");
93 break;
94 default:
95 ShouldNotReachHere();
96 }
97
98 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
99 // implemented by the stub executes just once. Call the stub only if at least two iterations will
100 // be executed.
101 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
102 cmpw(cnt, large_threshold);
103 br(Assembler::HS, LARGE);
104
105 bind(TAIL);
106
107 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
108 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
109 // Iteration eats up the remainder, uf elements at a time.
110 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
111 andr(tmp2, cnt, unroll_factor - 1);
112 adr(tmp1, BR_BASE);
113 // For Cortex-A53 offset is 4 because 2 nops are generated.
114 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
115 movw(tmp2, 0x1f);
116 br(tmp1);
117
118 bind(LOOP);
119 for (size_t i = 0; i < unroll_factor; ++i) {
120 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
121 maddw(result, result, tmp2, tmp1);
122 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
123 // Generate 2nd nop to have 4 instructions per iteration.
124 if (VM_Version::supports_a53mac()) {
125 nop();
126 }
127 }
128 bind(BR_BASE);
129 subsw(cnt, cnt, unroll_factor);
130 br(Assembler::HS, LOOP);
131
132 b(DONE);
133
134 bind(LARGE);
135
136 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
137 assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
138 address tpc = trampoline_call(stub);
139 if (tpc == nullptr) {
140 DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
141 postcond(pc() == badAddress);
142 return nullptr;
143 }
144
145 bind(DONE);
146
147 BLOCK_COMMENT("} // arrays_hashcode");
148
149 postcond(pc() != badAddress);
150 return pc();
151 }
152
153 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
154 Register t2, Register t3) {
155 assert_different_registers(obj, box, t1, t2, t3, rscratch2);
156
157 // Handle inflated monitor.
158 Label inflated;
159 // Finish fast lock successfully. MUST branch to with flag == EQ
160 Label locked;
161 // Finish fast lock unsuccessfully. MUST branch to with flag == NE
162 Label slow_path;
163
164 if (UseObjectMonitorTable) {
165 // Clear cache in case fast locking succeeds or we need to take the slow-path.
166 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
167 }
168
169 if (DiagnoseSyncOnValueBasedClasses != 0) {
170 load_klass(t1, obj);
171 ldrb(t1, Address(t1, Klass::misc_flags_offset()));
172 tst(t1, KlassFlags::_misc_is_value_based_class);
173 br(Assembler::NE, slow_path);
174 }
175
176 const Register t1_mark = t1;
177 const Register t3_t = t3;
178
179 { // Fast locking
180
181 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
182 Label push;
183
184 const Register t2_top = t2;
185
186 // Check if lock-stack is full.
187 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
188 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
189 br(Assembler::GT, slow_path);
190
191 // Check if recursive.
192 subw(t3_t, t2_top, oopSize);
193 ldr(t3_t, Address(rthread, t3_t));
194 cmp(obj, t3_t);
195 br(Assembler::EQ, push);
196
197 // Relaxed normal load to check for monitor. Optimization for monitor case.
198 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
199 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
200
201 // Not inflated
202 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
203
204 // Try to lock. Transition lock-bits 0b01 => 0b00
205 orr(t1_mark, t1_mark, markWord::unlocked_value);
206 eor(t3_t, t1_mark, markWord::unlocked_value);
207 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
208 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
209 br(Assembler::NE, slow_path);
210
211 bind(push);
212 // After successful lock, push object on lock-stack.
213 str(obj, Address(rthread, t2_top));
214 addw(t2_top, t2_top, oopSize);
215 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
216 b(locked);
217 }
218
219 { // Handle inflated monitor.
220 bind(inflated);
221
222 const Register t1_monitor = t1;
223
224 if (!UseObjectMonitorTable) {
225 assert(t1_monitor == t1_mark, "should be the same here");
226 } else {
227 const Register t1_hash = t1;
228 Label monitor_found;
229
230 // Save the mark, we might need it to extract the hash.
231 mov(t3, t1_mark);
232
233 // Look for the monitor in the om_cache.
234
235 ByteSize cache_offset = JavaThread::om_cache_oops_offset();
236 ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
237 const int num_unrolled = OMCache::CAPACITY;
238 for (int i = 0; i < num_unrolled; i++) {
239 ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
240 ldr(t2, Address(rthread, cache_offset));
241 cmp(obj, t2);
242 br(Assembler::EQ, monitor_found);
243 cache_offset = cache_offset + OMCache::oop_to_oop_difference();
244 }
245
246 if (UseCompactObjectHeaders) {
247 // TODO: The fast-path table lookup currently doesn't work with Lilliput's
248 // compact identity-hashcode implementation.
249 // See: https://bugs.openjdk.org/browse/JDK-8380981
250 b(slow_path);
251 } else {
252 // Look for the monitor in the table.
253
254 // Get the hash code.
255 ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);
256
257 // Get the table and calculate the bucket's address
258 lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
259 ldr(t3, Address(t3));
260 ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
261 ands(t1_hash, t1_hash, t2);
262 ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));
263
264 // Read the monitor from the bucket.
265 ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));
266
267 // Check if the monitor in the bucket is special (empty, tombstone or removed).
268 cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
269 br(Assembler::LO, slow_path);
270
271 // Check if object matches.
272 ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
273 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
274 bs_asm->try_peek_weak_handle_in_nmethod(this, t3, t3, t2, slow_path);
275 cmp(t3, obj);
276 br(Assembler::NE, slow_path);
277 }
278 bind(monitor_found);
279 }
280
281 const Register t2_owner_addr = t2;
282 const Register t3_owner = t3;
283 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
284 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
285 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
286
287 Label monitor_locked;
288
289 // Compute owner address.
290 lea(t2_owner_addr, owner_address);
291
292 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
293 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
294 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
295 /*release*/ false, /*weak*/ false, t3_owner);
296 br(Assembler::EQ, monitor_locked);
297
298 // Check if recursive.
299 cmp(t3_owner, rscratch2);
300 br(Assembler::NE, slow_path);
301
302 // Recursive.
303 increment(recursions_address, 1);
304
305 bind(monitor_locked);
306 if (UseObjectMonitorTable) {
307 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
308 }
309 }
310
311 bind(locked);
312
313 #ifdef ASSERT
314 // Check that locked label is reached with Flags == EQ.
315 Label flag_correct;
316 br(Assembler::EQ, flag_correct);
317 stop("Fast Lock Flag != EQ");
318 #endif
319
320 bind(slow_path);
321 #ifdef ASSERT
322 // Check that slow_path label is reached with Flags == NE.
323 br(Assembler::NE, flag_correct);
324 stop("Fast Lock Flag != NE");
325 bind(flag_correct);
326 #endif
327 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
328 }
329
330 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
331 Register t2, Register t3) {
332 assert_different_registers(obj, box, t1, t2, t3);
333
334 // Handle inflated monitor.
335 Label inflated, inflated_load_mark;
336 // Finish fast unlock successfully. MUST branch to with flag == EQ
337 Label unlocked;
338 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
339 Label slow_path;
340
341 const Register t1_mark = t1;
342 const Register t2_top = t2;
343 const Register t3_t = t3;
344
345 { // Fast unlock
346
347 Label push_and_slow_path;
348
349 // Check if obj is top of lock-stack.
350 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
351 subw(t2_top, t2_top, oopSize);
352 ldr(t3_t, Address(rthread, t2_top));
353 cmp(obj, t3_t);
354 // Top of lock stack was not obj. Must be monitor.
355 br(Assembler::NE, inflated_load_mark);
356
357 // Pop lock-stack.
358 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
359 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
360
361 // Check if recursive.
362 subw(t3_t, t2_top, oopSize);
363 ldr(t3_t, Address(rthread, t3_t));
364 cmp(obj, t3_t);
365 br(Assembler::EQ, unlocked);
366
367 // Not recursive.
368 // Load Mark.
369 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
370
371 // Check header for monitor (0b10).
372 // Because we got here by popping (meaning we pushed in locked)
373 // there will be no monitor in the box. So we need to push back the obj
374 // so that the runtime can fix any potential anonymous owner.
375 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
376
377 // Try to unlock. Transition lock bits 0b00 => 0b01
378 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
379 orr(t3_t, t1_mark, markWord::unlocked_value);
380 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
381 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
382 br(Assembler::EQ, unlocked);
383
384 bind(push_and_slow_path);
385 // Compare and exchange failed.
386 // Restore lock-stack and handle the unlock in runtime.
387 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
388 addw(t2_top, t2_top, oopSize);
389 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
390 b(slow_path);
391 }
392
393
394 { // Handle inflated monitor.
395 bind(inflated_load_mark);
396 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
397 #ifdef ASSERT
398 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
399 stop("Fast Unlock not monitor");
400 #endif
401
402 bind(inflated);
403
404 #ifdef ASSERT
405 Label check_done;
406 subw(t2_top, t2_top, oopSize);
407 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
408 br(Assembler::LT, check_done);
409 ldr(t3_t, Address(rthread, t2_top));
410 cmp(obj, t3_t);
411 br(Assembler::NE, inflated);
412 stop("Fast Unlock lock on stack");
413 bind(check_done);
414 #endif
415
416 const Register t1_monitor = t1;
417
418 if (!UseObjectMonitorTable) {
419 assert(t1_monitor == t1_mark, "should be the same here");
420
421 // Untag the monitor.
422 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
423 } else {
424 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
425 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
426 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
427 br(Assembler::LO, slow_path);
428 }
429
430 const Register t2_recursions = t2;
431 Label not_recursive;
432
433 // Check if recursive.
434 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
435 cbz(t2_recursions, not_recursive);
436
437 // Recursive unlock.
438 sub(t2_recursions, t2_recursions, 1u);
439 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
440 // Set flag == EQ
441 cmp(t2_recursions, t2_recursions);
442 b(unlocked);
443
444 bind(not_recursive);
445
446 const Register t2_owner_addr = t2;
447
448 // Compute owner address.
449 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
450
451 // Set owner to null.
452 // Release to satisfy the JMM
453 stlr(zr, t2_owner_addr);
454 // We need a full fence after clearing owner to avoid stranding.
455 // StoreLoad achieves this.
456 membar(StoreLoad);
457
458 // Check if the entry_list is empty.
459 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
460 cmp(rscratch1, zr);
461 br(Assembler::EQ, unlocked); // If so we are done.
462
463 // Check if there is a successor.
464 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
465 cmp(rscratch1, zr);
466 br(Assembler::NE, unlocked); // If so we are done.
467
468 // Save the monitor pointer in the current thread, so we can try to
469 // reacquire the lock in SharedRuntime::monitor_exit_helper().
470 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
471
472 cmp(zr, rthread); // Set Flag to NE => slow path
473 b(slow_path);
474 }
475
476 bind(unlocked);
477 cmp(zr, zr); // Set Flags to EQ => fast path
478
479 #ifdef ASSERT
480 // Check that unlocked label is reached with Flags == EQ.
481 Label flag_correct;
482 br(Assembler::EQ, flag_correct);
483 stop("Fast Unlock Flag != EQ");
484 #endif
485
486 bind(slow_path);
487 #ifdef ASSERT
488 // Check that slow_path label is reached with Flags == NE.
489 br(Assembler::NE, flag_correct);
490 stop("Fast Unlock Flag != NE");
491 bind(flag_correct);
492 #endif
493 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
494 }
495
496 // Search for str1 in str2 and return index or -1
497 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
498 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
499 Register cnt2, Register cnt1,
500 Register tmp1, Register tmp2,
501 Register tmp3, Register tmp4,
502 Register tmp5, Register tmp6,
503 int icnt1, Register result, int ae) {
504 // NOTE: tmp5, tmp6 can be zr depending on specific method version
505 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
506
507 Register ch1 = rscratch1;
508 Register ch2 = rscratch2;
509 Register cnt1tmp = tmp1;
510 Register cnt2tmp = tmp2;
511 Register cnt1_neg = cnt1;
512 Register cnt2_neg = cnt2;
513 Register result_tmp = tmp4;
514
515 bool isL = ae == StrIntrinsicNode::LL;
516
517 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
518 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
519 int str1_chr_shift = str1_isL ? 0:1;
520 int str2_chr_shift = str2_isL ? 0:1;
521 int str1_chr_size = str1_isL ? 1:2;
522 int str2_chr_size = str2_isL ? 1:2;
523 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
524 (chr_insn)&MacroAssembler::ldrh;
525 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
526 (chr_insn)&MacroAssembler::ldrh;
527 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
528 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
529
530 // Note, inline_string_indexOf() generates checks:
531 // if (substr.count > string.count) return -1;
532 // if (substr.count == 0) return 0;
533
534 // We have two strings, a source string in str2, cnt2 and a pattern string
535 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
536
537 // For larger pattern and source we use a simplified Boyer Moore algorithm.
538 // With a small pattern and source we use linear scan.
539
540 if (icnt1 == -1) {
541 sub(result_tmp, cnt2, cnt1);
542 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
543 br(LT, LINEARSEARCH);
544 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
545 subs(zr, cnt1, 256);
546 lsr(tmp1, cnt2, 2);
547 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
548 br(GE, LINEARSTUB);
549 }
550
551 // The Boyer Moore alogorithm is based on the description here:-
552 //
553 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
554 //
555 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
556 // and the 'Good Suffix' rule.
557 //
558 // These rules are essentially heuristics for how far we can shift the
559 // pattern along the search string.
560 //
561 // The implementation here uses the 'Bad Character' rule only because of the
562 // complexity of initialisation for the 'Good Suffix' rule.
563 //
564 // This is also known as the Boyer-Moore-Horspool algorithm:-
565 //
566 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
567 //
568 // This particular implementation has few java-specific optimizations.
569 //
570 // #define ASIZE 256
571 //
572 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
573 // int i, j;
574 // unsigned c;
575 // unsigned char bc[ASIZE];
576 //
577 // /* Preprocessing */
578 // for (i = 0; i < ASIZE; ++i)
579 // bc[i] = m;
580 // for (i = 0; i < m - 1; ) {
581 // c = x[i];
582 // ++i;
583 // // c < 256 for Latin1 string, so, no need for branch
584 // #ifdef PATTERN_STRING_IS_LATIN1
585 // bc[c] = m - i;
586 // #else
587 // if (c < ASIZE) bc[c] = m - i;
588 // #endif
589 // }
590 //
591 // /* Searching */
592 // j = 0;
593 // while (j <= n - m) {
594 // c = y[i+j];
595 // if (x[m-1] == c)
596 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
597 // if (i < 0) return j;
598 // // c < 256 for Latin1 string, so, no need for branch
599 // #ifdef SOURCE_STRING_IS_LATIN1
600 // // LL case: (c< 256) always true. Remove branch
601 // j += bc[y[j+m-1]];
602 // #endif
603 // #ifndef PATTERN_STRING_IS_UTF
604 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
605 // if (c < ASIZE)
606 // j += bc[y[j+m-1]];
607 // else
608 // j += 1
609 // #endif
610 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
611 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
612 // if (c < ASIZE)
613 // j += bc[y[j+m-1]];
614 // else
615 // j += m
616 // #endif
617 // }
618 // }
619
620 if (icnt1 == -1) {
621 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
622 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
623 Register cnt1end = tmp2;
624 Register str2end = cnt2;
625 Register skipch = tmp2;
626
627 // str1 length is >=8, so, we can read at least 1 register for cases when
628 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
629 // UL case. We'll re-read last character in inner pre-loop code to have
630 // single outer pre-loop load
631 const int firstStep = isL ? 7 : 3;
632
633 const int ASIZE = 256;
634 const int STORED_BYTES = 32; // amount of bytes stored per instruction
635 sub(sp, sp, ASIZE);
636 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
637 mov(ch1, sp);
638 BIND(BM_INIT_LOOP);
639 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
640 subs(tmp5, tmp5, 1);
641 br(GT, BM_INIT_LOOP);
642
643 sub(cnt1tmp, cnt1, 1);
644 mov(tmp5, str2);
645 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
646 sub(ch2, cnt1, 1);
647 mov(tmp3, str1);
648 BIND(BCLOOP);
649 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
650 if (!str1_isL) {
651 subs(zr, ch1, ASIZE);
652 br(HS, BCSKIP);
653 }
654 strb(ch2, Address(sp, ch1));
655 BIND(BCSKIP);
656 subs(ch2, ch2, 1);
657 br(GT, BCLOOP);
658
659 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
660 if (str1_isL == str2_isL) {
661 // load last 8 bytes (8LL/4UU symbols)
662 ldr(tmp6, Address(tmp6, -wordSize));
663 } else {
664 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
665 // convert Latin1 to UTF. We'll have to wait until load completed, but
666 // it's still faster than per-character loads+checks
667 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
668 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
669 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
670 andr(tmp6, tmp6, 0xFF); // str1[N-4]
671 orr(ch2, ch1, ch2, LSL, 16);
672 orr(tmp6, tmp6, tmp3, LSL, 48);
673 orr(tmp6, tmp6, ch2, LSL, 16);
674 }
675 BIND(BMLOOPSTR2);
676 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
677 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
678 if (str1_isL == str2_isL) {
679 // re-init tmp3. It's for free because it's executed in parallel with
680 // load above. Alternative is to initialize it before loop, but it'll
681 // affect performance on in-order systems with 2 or more ld/st pipelines
682 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
683 }
684 if (!isL) { // UU/UL case
685 lsl(ch2, cnt1tmp, 1); // offset in bytes
686 }
687 cmp(tmp3, skipch);
688 br(NE, BMSKIP);
689 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
690 mov(ch1, tmp6);
691 if (isL) {
692 b(BMLOOPSTR1_AFTER_LOAD);
693 } else {
694 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
695 b(BMLOOPSTR1_CMP);
696 }
697 BIND(BMLOOPSTR1);
698 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
699 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
700 BIND(BMLOOPSTR1_AFTER_LOAD);
701 subs(cnt1tmp, cnt1tmp, 1);
702 br(LT, BMLOOPSTR1_LASTCMP);
703 BIND(BMLOOPSTR1_CMP);
704 cmp(ch1, ch2);
705 br(EQ, BMLOOPSTR1);
706 BIND(BMSKIP);
707 if (!isL) {
708 // if we've met UTF symbol while searching Latin1 pattern, then we can
709 // skip cnt1 symbols
710 if (str1_isL != str2_isL) {
711 mov(result_tmp, cnt1);
712 } else {
713 mov(result_tmp, 1);
714 }
715 subs(zr, skipch, ASIZE);
716 br(HS, BMADV);
717 }
718 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
719 BIND(BMADV);
720 sub(cnt1tmp, cnt1, 1);
721 add(str2, str2, result_tmp, LSL, str2_chr_shift);
722 cmp(str2, str2end);
723 br(LE, BMLOOPSTR2);
724 add(sp, sp, ASIZE);
725 b(NOMATCH);
726 BIND(BMLOOPSTR1_LASTCMP);
727 cmp(ch1, ch2);
728 br(NE, BMSKIP);
729 BIND(BMMATCH);
730 sub(result, str2, tmp5);
731 if (!str2_isL) lsr(result, result, 1);
732 add(sp, sp, ASIZE);
733 b(DONE);
734
735 BIND(LINEARSTUB);
736 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
737 br(LT, LINEAR_MEDIUM);
738 mov(result, zr);
739 RuntimeAddress stub = nullptr;
740 if (isL) {
741 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
742 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
743 } else if (str1_isL) {
744 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
745 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
746 } else {
747 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
748 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
749 }
750 address call = trampoline_call(stub);
751 if (call == nullptr) {
752 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
753 ciEnv::current()->record_failure("CodeCache is full");
754 return;
755 }
756 b(DONE);
757 }
758
759 BIND(LINEARSEARCH);
760 {
761 Label DO1, DO2, DO3;
762
763 Register str2tmp = tmp2;
764 Register first = tmp3;
765
766 if (icnt1 == -1)
767 {
768 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
769
770 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
771 br(LT, DOSHORT);
772 BIND(LINEAR_MEDIUM);
773 (this->*str1_load_1chr)(first, Address(str1));
774 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
775 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
776 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
777 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
778
779 BIND(FIRST_LOOP);
780 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
781 cmp(first, ch2);
782 br(EQ, STR1_LOOP);
783 BIND(STR2_NEXT);
784 adds(cnt2_neg, cnt2_neg, str2_chr_size);
785 br(LE, FIRST_LOOP);
786 b(NOMATCH);
787
788 BIND(STR1_LOOP);
789 adds(cnt1tmp, cnt1_neg, str1_chr_size);
790 add(cnt2tmp, cnt2_neg, str2_chr_size);
791 br(GE, MATCH);
792
793 BIND(STR1_NEXT);
794 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
795 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
796 cmp(ch1, ch2);
797 br(NE, STR2_NEXT);
798 adds(cnt1tmp, cnt1tmp, str1_chr_size);
799 add(cnt2tmp, cnt2tmp, str2_chr_size);
800 br(LT, STR1_NEXT);
801 b(MATCH);
802
803 BIND(DOSHORT);
804 if (str1_isL == str2_isL) {
805 cmp(cnt1, (u1)2);
806 br(LT, DO1);
807 br(GT, DO3);
808 }
809 }
810
811 if (icnt1 == 4) {
812 Label CH1_LOOP;
813
814 (this->*load_4chr)(ch1, str1);
815 sub(result_tmp, cnt2, 4);
816 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
817 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
818
819 BIND(CH1_LOOP);
820 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
821 cmp(ch1, ch2);
822 br(EQ, MATCH);
823 adds(cnt2_neg, cnt2_neg, str2_chr_size);
824 br(LE, CH1_LOOP);
825 b(NOMATCH);
826 }
827
828 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
829 Label CH1_LOOP;
830
831 BIND(DO2);
832 (this->*load_2chr)(ch1, str1);
833 if (icnt1 == 2) {
834 sub(result_tmp, cnt2, 2);
835 }
836 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
837 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
838 BIND(CH1_LOOP);
839 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
840 cmp(ch1, ch2);
841 br(EQ, MATCH);
842 adds(cnt2_neg, cnt2_neg, str2_chr_size);
843 br(LE, CH1_LOOP);
844 b(NOMATCH);
845 }
846
847 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
848 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
849
850 BIND(DO3);
851 (this->*load_2chr)(first, str1);
852 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
853 if (icnt1 == 3) {
854 sub(result_tmp, cnt2, 3);
855 }
856 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
857 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
858 BIND(FIRST_LOOP);
859 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
860 cmpw(first, ch2);
861 br(EQ, STR1_LOOP);
862 BIND(STR2_NEXT);
863 adds(cnt2_neg, cnt2_neg, str2_chr_size);
864 br(LE, FIRST_LOOP);
865 b(NOMATCH);
866
867 BIND(STR1_LOOP);
868 add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
869 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
870 cmp(ch1, ch2);
871 br(NE, STR2_NEXT);
872 b(MATCH);
873 }
874
875 if (icnt1 == -1 || icnt1 == 1) {
876 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
877
878 BIND(DO1);
879 (this->*str1_load_1chr)(ch1, str1);
880 cmp(cnt2, (u1)8);
881 br(LT, DO1_SHORT);
882
883 sub(result_tmp, cnt2, 8/str2_chr_size);
884 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
885 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
886 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
887
888 if (str2_isL) {
889 orr(ch1, ch1, ch1, LSL, 8);
890 }
891 orr(ch1, ch1, ch1, LSL, 16);
892 orr(ch1, ch1, ch1, LSL, 32);
893 BIND(CH1_LOOP);
894 ldr(ch2, Address(str2, cnt2_neg));
895 eor(ch2, ch1, ch2);
896 sub(tmp1, ch2, tmp3);
897 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
898 bics(tmp1, tmp1, tmp2);
899 br(NE, HAS_ZERO);
900 adds(cnt2_neg, cnt2_neg, 8);
901 br(LT, CH1_LOOP);
902
903 cmp(cnt2_neg, (u1)8);
904 mov(cnt2_neg, 0);
905 br(LT, CH1_LOOP);
906 b(NOMATCH);
907
908 BIND(HAS_ZERO);
909 rev(tmp1, tmp1);
910 clz(tmp1, tmp1);
911 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
912 b(MATCH);
913
914 BIND(DO1_SHORT);
915 mov(result_tmp, cnt2);
916 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
917 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
918 BIND(DO1_LOOP);
919 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
920 cmpw(ch1, ch2);
921 br(EQ, MATCH);
922 adds(cnt2_neg, cnt2_neg, str2_chr_size);
923 br(LT, DO1_LOOP);
924 }
925 }
926 BIND(NOMATCH);
927 mov(result, -1);
928 b(DONE);
929 BIND(MATCH);
930 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
931 BIND(DONE);
932 }
933
934 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
935 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
936
937 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
938 Register ch, Register result,
939 Register tmp1, Register tmp2, Register tmp3)
940 {
941 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
942 Register cnt1_neg = cnt1;
943 Register ch1 = rscratch1;
944 Register result_tmp = rscratch2;
945
946 cbz(cnt1, NOMATCH);
947
948 cmp(cnt1, (u1)4);
949 br(LT, DO1_SHORT);
950
951 orr(ch, ch, ch, LSL, 16);
952 orr(ch, ch, ch, LSL, 32);
953
954 sub(cnt1, cnt1, 4);
955 mov(result_tmp, cnt1);
956 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
957 sub(cnt1_neg, zr, cnt1, LSL, 1);
958
959 mov(tmp3, 0x0001000100010001);
960
961 BIND(CH1_LOOP);
962 ldr(ch1, Address(str1, cnt1_neg));
963 eor(ch1, ch, ch1);
964 sub(tmp1, ch1, tmp3);
965 orr(tmp2, ch1, 0x7fff7fff7fff7fff);
966 bics(tmp1, tmp1, tmp2);
967 br(NE, HAS_ZERO);
968 adds(cnt1_neg, cnt1_neg, 8);
969 br(LT, CH1_LOOP);
970
971 cmp(cnt1_neg, (u1)8);
972 mov(cnt1_neg, 0);
973 br(LT, CH1_LOOP);
974 b(NOMATCH);
975
976 BIND(HAS_ZERO);
977 rev(tmp1, tmp1);
978 clz(tmp1, tmp1);
979 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
980 b(MATCH);
981
982 BIND(DO1_SHORT);
983 mov(result_tmp, cnt1);
984 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
985 sub(cnt1_neg, zr, cnt1, LSL, 1);
986 BIND(DO1_LOOP);
987 ldrh(ch1, Address(str1, cnt1_neg));
988 cmpw(ch, ch1);
989 br(EQ, MATCH);
990 adds(cnt1_neg, cnt1_neg, 2);
991 br(LT, DO1_LOOP);
992 BIND(NOMATCH);
993 mov(result, -1);
994 b(DONE);
995 BIND(MATCH);
996 add(result, result_tmp, cnt1_neg, ASR, 1);
997 BIND(DONE);
998 }
999
1000 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1001 Register ch, Register result,
1002 FloatRegister ztmp1,
1003 FloatRegister ztmp2,
1004 PRegister tmp_pg,
1005 PRegister tmp_pdn, bool isL)
1006 {
1007 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1008 assert(tmp_pg->is_governing(),
1009 "this register has to be a governing predicate register");
1010
1011 Label LOOP, MATCH, DONE, NOMATCH;
1012 Register vec_len = rscratch1;
1013 Register idx = rscratch2;
1014
1015 SIMD_RegVariant T = (isL == true) ? B : H;
1016
1017 cbz(cnt1, NOMATCH);
1018
1019 // Assign the particular char throughout the vector.
1020 sve_dup(ztmp2, T, ch);
1021 if (isL) {
1022 sve_cntb(vec_len);
1023 } else {
1024 sve_cnth(vec_len);
1025 }
1026 mov(idx, 0);
1027
1028 // Generate a predicate to control the reading of input string.
1029 sve_whilelt(tmp_pg, T, idx, cnt1);
1030
1031 BIND(LOOP);
1032 // Read a vector of 8- or 16-bit data depending on the string type. Note
1033 // that inactive elements indicated by the predicate register won't cause
1034 // a data read from memory to the destination vector.
1035 if (isL) {
1036 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1037 } else {
1038 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1039 }
1040 add(idx, idx, vec_len);
1041
1042 // Perform the comparison. An element of the destination predicate is set
1043 // to active if the particular char is matched.
1044 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1045
1046 // Branch if the particular char is found.
1047 br(NE, MATCH);
1048
1049 sve_whilelt(tmp_pg, T, idx, cnt1);
1050
1051 // Loop back if the particular char not found.
1052 br(MI, LOOP);
1053
1054 BIND(NOMATCH);
1055 mov(result, -1);
1056 b(DONE);
1057
1058 BIND(MATCH);
1059 // Undo the index increment.
1060 sub(idx, idx, vec_len);
1061
1062 // Crop the vector to find its location.
1063 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1064 add(result, idx, -1);
1065 sve_incp(result, T, tmp_pdn);
1066 BIND(DONE);
1067 }
1068
1069 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1070 Register ch, Register result,
1071 Register tmp1, Register tmp2, Register tmp3)
1072 {
1073 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1074 Register cnt1_neg = cnt1;
1075 Register ch1 = rscratch1;
1076 Register result_tmp = rscratch2;
1077
1078 cbz(cnt1, NOMATCH);
1079
1080 cmp(cnt1, (u1)8);
1081 br(LT, DO1_SHORT);
1082
1083 orr(ch, ch, ch, LSL, 8);
1084 orr(ch, ch, ch, LSL, 16);
1085 orr(ch, ch, ch, LSL, 32);
1086
1087 sub(cnt1, cnt1, 8);
1088 mov(result_tmp, cnt1);
1089 lea(str1, Address(str1, cnt1));
1090 sub(cnt1_neg, zr, cnt1);
1091
1092 mov(tmp3, 0x0101010101010101);
1093
1094 BIND(CH1_LOOP);
1095 ldr(ch1, Address(str1, cnt1_neg));
1096 eor(ch1, ch, ch1);
1097 sub(tmp1, ch1, tmp3);
1098 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1099 bics(tmp1, tmp1, tmp2);
1100 br(NE, HAS_ZERO);
1101 adds(cnt1_neg, cnt1_neg, 8);
1102 br(LT, CH1_LOOP);
1103
1104 cmp(cnt1_neg, (u1)8);
1105 mov(cnt1_neg, 0);
1106 br(LT, CH1_LOOP);
1107 b(NOMATCH);
1108
1109 BIND(HAS_ZERO);
1110 rev(tmp1, tmp1);
1111 clz(tmp1, tmp1);
1112 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1113 b(MATCH);
1114
1115 BIND(DO1_SHORT);
1116 mov(result_tmp, cnt1);
1117 lea(str1, Address(str1, cnt1));
1118 sub(cnt1_neg, zr, cnt1);
1119 BIND(DO1_LOOP);
1120 ldrb(ch1, Address(str1, cnt1_neg));
1121 cmp(ch, ch1);
1122 br(EQ, MATCH);
1123 adds(cnt1_neg, cnt1_neg, 1);
1124 br(LT, DO1_LOOP);
1125 BIND(NOMATCH);
1126 mov(result, -1);
1127 b(DONE);
1128 BIND(MATCH);
1129 add(result, result_tmp, cnt1_neg);
1130 BIND(DONE);
1131 }
1132
1133 // Compare strings.
1134 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1135 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1136 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1137 PRegister pgtmp1, PRegister pgtmp2, int ae) {
1138 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1139 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1140 SHORT_LOOP_START, TAIL_CHECK;
1141
1142 bool isLL = ae == StrIntrinsicNode::LL;
1143 bool isLU = ae == StrIntrinsicNode::LU;
1144 bool isUL = ae == StrIntrinsicNode::UL;
1145
1146 // The stub threshold for LL strings is: 72 (64 + 8) chars
1147 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1148 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1149 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1150
1151 bool str1_isL = isLL || isLU;
1152 bool str2_isL = isLL || isUL;
1153
1154 int str1_chr_shift = str1_isL ? 0 : 1;
1155 int str2_chr_shift = str2_isL ? 0 : 1;
1156 int str1_chr_size = str1_isL ? 1 : 2;
1157 int str2_chr_size = str2_isL ? 1 : 2;
1158 int minCharsInWord = isLL ? wordSize : wordSize/2;
1159
1160 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1161 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1162 (chr_insn)&MacroAssembler::ldrh;
1163 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1164 (chr_insn)&MacroAssembler::ldrh;
1165 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1166 (uxt_insn)&MacroAssembler::uxthw;
1167
1168 BLOCK_COMMENT("string_compare {");
1169
1170 // Bizarrely, the counts are passed in bytes, regardless of whether they
1171 // are L or U strings, however the result is always in characters.
1172 if (!str1_isL) asrw(cnt1, cnt1, 1);
1173 if (!str2_isL) asrw(cnt2, cnt2, 1);
1174
1175 // Compute the minimum of the string lengths and save the difference.
1176 subsw(result, cnt1, cnt2);
1177 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1178
1179 // A very short string
1180 cmpw(cnt2, minCharsInWord);
1181 br(Assembler::LE, SHORT_STRING);
1182
1183 // Compare longwords
1184 // load first parts of strings and finish initialization while loading
1185 {
1186 if (str1_isL == str2_isL) { // LL or UU
1187 ldr(tmp1, Address(str1));
1188 cmp(str1, str2);
1189 br(Assembler::EQ, DONE);
1190 ldr(tmp2, Address(str2));
1191 cmp(cnt2, stub_threshold);
1192 br(GE, STUB);
1193 subsw(cnt2, cnt2, minCharsInWord);
1194 br(EQ, TAIL_CHECK);
1195 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1196 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1197 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1198 } else if (isLU) {
1199 ldrs(vtmp, Address(str1));
1200 ldr(tmp2, Address(str2));
1201 cmp(cnt2, stub_threshold);
1202 br(GE, STUB);
1203 subw(cnt2, cnt2, 4);
1204 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1205 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1206 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1207 zip1(vtmp, T8B, vtmp, vtmpZ);
1208 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1209 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1210 add(cnt1, cnt1, 4);
1211 fmovd(tmp1, vtmp);
1212 } else { // UL case
1213 ldr(tmp1, Address(str1));
1214 ldrs(vtmp, Address(str2));
1215 cmp(cnt2, stub_threshold);
1216 br(GE, STUB);
1217 subw(cnt2, cnt2, 4);
1218 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1219 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1220 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1221 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1222 zip1(vtmp, T8B, vtmp, vtmpZ);
1223 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1224 add(cnt1, cnt1, 8);
1225 fmovd(tmp2, vtmp);
1226 }
1227 adds(cnt2, cnt2, isUL ? 4 : 8);
1228 br(GE, TAIL);
1229 eor(rscratch2, tmp1, tmp2);
1230 cbnz(rscratch2, DIFF);
1231 // main loop
1232 bind(NEXT_WORD);
1233 if (str1_isL == str2_isL) {
1234 ldr(tmp1, Address(str1, cnt2));
1235 ldr(tmp2, Address(str2, cnt2));
1236 adds(cnt2, cnt2, 8);
1237 } else if (isLU) {
1238 ldrs(vtmp, Address(str1, cnt1));
1239 ldr(tmp2, Address(str2, cnt2));
1240 add(cnt1, cnt1, 4);
1241 zip1(vtmp, T8B, vtmp, vtmpZ);
1242 fmovd(tmp1, vtmp);
1243 adds(cnt2, cnt2, 8);
1244 } else { // UL
1245 ldrs(vtmp, Address(str2, cnt2));
1246 ldr(tmp1, Address(str1, cnt1));
1247 zip1(vtmp, T8B, vtmp, vtmpZ);
1248 add(cnt1, cnt1, 8);
1249 fmovd(tmp2, vtmp);
1250 adds(cnt2, cnt2, 4);
1251 }
1252 br(GE, TAIL);
1253
1254 eor(rscratch2, tmp1, tmp2);
1255 cbz(rscratch2, NEXT_WORD);
1256 b(DIFF);
1257 bind(TAIL);
1258 eor(rscratch2, tmp1, tmp2);
1259 cbnz(rscratch2, DIFF);
1260 // Last longword. In the case where length == 4 we compare the
1261 // same longword twice, but that's still faster than another
1262 // conditional branch.
1263 if (str1_isL == str2_isL) {
1264 ldr(tmp1, Address(str1));
1265 ldr(tmp2, Address(str2));
1266 } else if (isLU) {
1267 ldrs(vtmp, Address(str1));
1268 ldr(tmp2, Address(str2));
1269 zip1(vtmp, T8B, vtmp, vtmpZ);
1270 fmovd(tmp1, vtmp);
1271 } else { // UL
1272 ldrs(vtmp, Address(str2));
1273 ldr(tmp1, Address(str1));
1274 zip1(vtmp, T8B, vtmp, vtmpZ);
1275 fmovd(tmp2, vtmp);
1276 }
1277 bind(TAIL_CHECK);
1278 eor(rscratch2, tmp1, tmp2);
1279 cbz(rscratch2, DONE);
1280
1281 // Find the first different characters in the longwords and
1282 // compute their difference.
1283 bind(DIFF);
1284 rev(rscratch2, rscratch2);
1285 clz(rscratch2, rscratch2);
1286 andr(rscratch2, rscratch2, isLL ? -8 : -16);
1287 lsrv(tmp1, tmp1, rscratch2);
1288 (this->*ext_chr)(tmp1, tmp1);
1289 lsrv(tmp2, tmp2, rscratch2);
1290 (this->*ext_chr)(tmp2, tmp2);
1291 subw(result, tmp1, tmp2);
1292 b(DONE);
1293 }
1294
1295 bind(STUB);
1296 RuntimeAddress stub = nullptr;
1297 switch(ae) {
1298 case StrIntrinsicNode::LL:
1299 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1300 break;
1301 case StrIntrinsicNode::UU:
1302 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1303 break;
1304 case StrIntrinsicNode::LU:
1305 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1306 break;
1307 case StrIntrinsicNode::UL:
1308 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1309 break;
1310 default:
1311 ShouldNotReachHere();
1312 }
1313 assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1314 address call = trampoline_call(stub);
1315 if (call == nullptr) {
1316 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1317 ciEnv::current()->record_failure("CodeCache is full");
1318 return;
1319 }
1320 b(DONE);
1321
1322 bind(SHORT_STRING);
1323 // Is the minimum length zero?
1324 cbz(cnt2, DONE);
1325 // arrange code to do most branches while loading and loading next characters
1326 // while comparing previous
1327 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1328 subs(cnt2, cnt2, 1);
1329 br(EQ, SHORT_LAST_INIT);
1330 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1331 b(SHORT_LOOP_START);
1332 bind(SHORT_LOOP);
1333 subs(cnt2, cnt2, 1);
1334 br(EQ, SHORT_LAST);
1335 bind(SHORT_LOOP_START);
1336 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1337 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1338 cmp(tmp1, cnt1);
1339 br(NE, SHORT_LOOP_TAIL);
1340 subs(cnt2, cnt2, 1);
1341 br(EQ, SHORT_LAST2);
1342 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1343 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1344 cmp(tmp2, rscratch1);
1345 br(EQ, SHORT_LOOP);
1346 sub(result, tmp2, rscratch1);
1347 b(DONE);
1348 bind(SHORT_LOOP_TAIL);
1349 sub(result, tmp1, cnt1);
1350 b(DONE);
1351 bind(SHORT_LAST2);
1352 cmp(tmp2, rscratch1);
1353 br(EQ, DONE);
1354 sub(result, tmp2, rscratch1);
1355
1356 b(DONE);
1357 bind(SHORT_LAST_INIT);
1358 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1359 bind(SHORT_LAST);
1360 cmp(tmp1, cnt1);
1361 br(EQ, DONE);
1362 sub(result, tmp1, cnt1);
1363
1364 bind(DONE);
1365
1366 BLOCK_COMMENT("} string_compare");
1367 }
1368
1369 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1370 FloatRegister src2, Condition cond, bool isQ) {
1371 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1372 FloatRegister zn = src1, zm = src2;
1373 bool needs_negation = false;
1374 switch (cond) {
1375 case LT: cond = GT; zn = src2; zm = src1; break;
1376 case LE: cond = GE; zn = src2; zm = src1; break;
1377 case LO: cond = HI; zn = src2; zm = src1; break;
1378 case LS: cond = HS; zn = src2; zm = src1; break;
1379 case NE: cond = EQ; needs_negation = true; break;
1380 default:
1381 break;
1382 }
1383
1384 if (is_floating_point_type(bt)) {
1385 fcm(cond, dst, size, zn, zm);
1386 } else {
1387 cm(cond, dst, size, zn, zm);
1388 }
1389
1390 if (needs_negation) {
1391 notr(dst, isQ ? T16B : T8B, dst);
1392 }
1393 }
1394
1395 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1396 Condition cond, bool isQ) {
1397 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1398 if (bt == T_FLOAT || bt == T_DOUBLE) {
1399 if (cond == Assembler::NE) {
1400 fcm(Assembler::EQ, dst, size, src);
1401 notr(dst, isQ ? T16B : T8B, dst);
1402 } else {
1403 fcm(cond, dst, size, src);
1404 }
1405 } else {
1406 if (cond == Assembler::NE) {
1407 cm(Assembler::EQ, dst, size, src);
1408 notr(dst, isQ ? T16B : T8B, dst);
1409 } else {
1410 cm(cond, dst, size, src);
1411 }
1412 }
1413 }
1414
1415 // Compress the least significant bit of each byte to the rightmost and clear
1416 // the higher garbage bits.
1417 void C2_MacroAssembler::bytemask_compress(Register dst) {
1418 // Example input, dst = 0x01 00 00 00 01 01 00 01
1419 // The "??" bytes are garbage.
1420 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1421 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1422 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1423 andr(dst, dst, 0xff); // dst = 0x8D
1424 }
1425
1426 // Pack the value of each mask element in "src" into a long value in "dst", at most
1427 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1428 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1429 // one bit in "dst".
1430 //
1431 // Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1432 // Expected: dst = 0x658D
1433 //
1434 // Clobbers: rscratch1
1435 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1436 FloatRegister vtmp, int lane_cnt) {
1437 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1438 assert_different_registers(dst, rscratch1);
1439 assert_different_registers(src, vtmp);
1440 assert(UseSVE > 0, "must be");
1441
1442 // Compress the lowest 8 bytes.
1443 fmovd(dst, src);
1444 bytemask_compress(dst);
1445 if (lane_cnt <= 8) return;
1446
1447 // Repeat on higher bytes and join the results.
1448 // Compress 8 bytes in each iteration.
1449 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1450 sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1451 bytemask_compress(rscratch1);
1452 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1453 }
1454 }
1455
1456 // The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
1457 // instruction which requires the FEAT_BITPERM feature.
1458 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1459 FloatRegister vtmp1, FloatRegister vtmp2,
1460 int lane_cnt) {
1461 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1462 assert_different_registers(src, vtmp1, vtmp2);
1463 assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1464
1465 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1466 // is to compress each significant bit of the byte in a cross-lane way. Due
1467 // to the lack of a cross-lane bit-compress instruction, we use BEXT
1468 // (bit-compress in each lane) with the biggest lane size (T = D) then
1469 // concatenate the results.
1470
1471 // The second source input of BEXT, initialized with 0x01 in each byte.
1472 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1473 sve_dup(vtmp2, B, 1);
1474
1475 // BEXT vtmp1.D, src.D, vtmp2.D
1476 // src = 0x0001010000010001 | 0x0100000001010001
1477 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1478 // ---------------------------------------
1479 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1480 sve_bext(vtmp1, D, src, vtmp2);
1481
1482 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1483 // result to dst.
1484 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1485 // dst = 0x658D
1486 if (lane_cnt <= 8) {
1487 // No need to concatenate.
1488 umov(dst, vtmp1, B, 0);
1489 } else if (lane_cnt <= 16) {
1490 ins(vtmp1, B, vtmp1, 1, 8);
1491 umov(dst, vtmp1, H, 0);
1492 } else {
1493 // As the lane count is 64 at most, the final expected value must be in
1494 // the lowest 64 bits after narrowing vtmp1 from D to B.
1495 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1496 umov(dst, vtmp1, D, 0);
1497 }
1498 }
1499
1500 // Unpack the mask, a long value in "src", into a vector register of boolean
1501 // represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
1502 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
1503 // most 64 lanes.
1504 //
1505 // Below example gives the expected dst vector register, with a valid src(0x658D)
1506 // on a 128-bit vector size machine.
1507 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1508 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1509 FloatRegister vtmp, int lane_cnt) {
1510 assert_different_registers(dst, vtmp);
1511 assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1512 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1513
1514 // Example: src = 0x658D, lane_cnt = 16
1515 // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1516
1517 // Put long value from general purpose register into the first lane of vector.
1518 // vtmp = 0x0000000000000000 | 0x000000000000658D
1519 sve_dup(vtmp, B, 0);
1520 mov(vtmp, D, 0, src);
1521
1522 // Transform the value in the first lane which is mask in bit now to the mask in
1523 // byte, which can be done by SVE2's BDEP instruction.
1524
1525 // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
1526 // vtmp = 0x0000000000000065 | 0x000000000000008D
1527 if (lane_cnt <= 8) {
1528 // Nothing. As only one byte exsits.
1529 } else if (lane_cnt <= 16) {
1530 ins(vtmp, B, vtmp, 8, 1);
1531 } else {
1532 sve_vector_extend(vtmp, D, vtmp, B);
1533 }
1534
1535 // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1536 // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1537 sve_dup(dst, B, 1);
1538
1539 // BDEP dst.D, vtmp.D, dst.D
1540 // vtmp = 0x0000000000000065 | 0x000000000000008D
1541 // dst = 0x0101010101010101 | 0x0101010101010101
1542 // ---------------------------------------
1543 // dst = 0x0001010000010001 | 0x0100000001010001
1544 sve_bdep(dst, D, vtmp, dst);
1545 }
1546
1547 // Clobbers: rflags
1548 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1549 FloatRegister zn, FloatRegister zm, Condition cond) {
1550 assert(pg->is_governing(), "This register has to be a governing predicate register");
1551 FloatRegister z1 = zn, z2 = zm;
1552 switch (cond) {
1553 case LE: z1 = zm; z2 = zn; cond = GE; break;
1554 case LT: z1 = zm; z2 = zn; cond = GT; break;
1555 case LO: z1 = zm; z2 = zn; cond = HI; break;
1556 case LS: z1 = zm; z2 = zn; cond = HS; break;
1557 default:
1558 break;
1559 }
1560
1561 SIMD_RegVariant size = elemType_to_regVariant(bt);
1562 if (is_floating_point_type(bt)) {
1563 sve_fcm(cond, pd, size, pg, z1, z2);
1564 } else {
1565 assert(is_integral_type(bt), "unsupported element type");
1566 sve_cmp(cond, pd, size, pg, z1, z2);
1567 }
1568 }
1569
1570 // Get index of the last mask lane that is set
1571 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1572 SIMD_RegVariant size = elemType_to_regVariant(bt);
1573 sve_rev(ptmp, size, src);
1574 sve_brkb(ptmp, ptrue, ptmp, false);
1575 sve_cntp(dst, size, ptrue, ptmp);
1576 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1577 subw(dst, rscratch1, dst);
1578 }
1579
1580 // Extend integer vector src to dst with the same lane count
1581 // but larger element size, e.g. 4B -> 4I
1582 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1583 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1584 if (src_bt == T_BYTE) {
1585 // 4B to 4S/4I, 8B to 8S
1586 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1587 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1588 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1589 if (dst_bt == T_INT) {
1590 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1591 }
1592 } else if (src_bt == T_SHORT) {
1593 // 2S to 2I/2L, 4S to 4I
1594 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1595 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1596 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1597 if (dst_bt == T_LONG) {
1598 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1599 }
1600 } else if (src_bt == T_INT) {
1601 // 2I to 2L
1602 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1603 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1604 } else {
1605 ShouldNotReachHere();
1606 }
1607 }
1608
1609 // Narrow integer vector src down to dst with the same lane count
1610 // but smaller element size, e.g. 4I -> 4B
1611 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1612 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1613 if (src_bt == T_SHORT) {
1614 // 4S/8S to 4B/8B
1615 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1616 assert(dst_bt == T_BYTE, "unsupported");
1617 xtn(dst, T8B, src, T8H);
1618 } else if (src_bt == T_INT) {
1619 // 2I to 2S, 4I to 4B/4S
1620 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1621 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1622 xtn(dst, T4H, src, T4S);
1623 if (dst_bt == T_BYTE) {
1624 xtn(dst, T8B, dst, T8H);
1625 }
1626 } else if (src_bt == T_LONG) {
1627 // 2L to 2S/2I
1628 assert(src_vlen_in_bytes == 16, "unsupported");
1629 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1630 xtn(dst, T2S, src, T2D);
1631 if (dst_bt == T_SHORT) {
1632 xtn(dst, T4H, dst, T4S);
1633 }
1634 } else {
1635 ShouldNotReachHere();
1636 }
1637 }
1638
1639 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1640 FloatRegister src, SIMD_RegVariant src_size,
1641 bool is_unsigned) {
1642 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1643
1644 if (src_size == B) {
1645 switch (dst_size) {
1646 case H:
1647 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1648 break;
1649 case S:
1650 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1651 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1652 break;
1653 case D:
1654 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1655 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1656 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1657 break;
1658 default:
1659 ShouldNotReachHere();
1660 }
1661 } else if (src_size == H) {
1662 if (dst_size == S) {
1663 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1664 } else { // D
1665 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1666 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1667 }
1668 } else if (src_size == S) {
1669 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1670 }
1671 }
1672
1673 // Vector narrow from src to dst with specified element sizes.
1674 // High part of dst vector will be filled with zero.
1675 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1676 FloatRegister src, SIMD_RegVariant src_size,
1677 FloatRegister tmp) {
1678 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1679 assert_different_registers(src, tmp);
1680 sve_dup(tmp, src_size, 0);
1681 if (src_size == D) {
1682 switch (dst_size) {
1683 case S:
1684 sve_uzp1(dst, S, src, tmp);
1685 break;
1686 case H:
1687 assert_different_registers(dst, tmp);
1688 sve_uzp1(dst, S, src, tmp);
1689 sve_uzp1(dst, H, dst, tmp);
1690 break;
1691 case B:
1692 assert_different_registers(dst, tmp);
1693 sve_uzp1(dst, S, src, tmp);
1694 sve_uzp1(dst, H, dst, tmp);
1695 sve_uzp1(dst, B, dst, tmp);
1696 break;
1697 default:
1698 ShouldNotReachHere();
1699 }
1700 } else if (src_size == S) {
1701 if (dst_size == H) {
1702 sve_uzp1(dst, H, src, tmp);
1703 } else { // B
1704 assert_different_registers(dst, tmp);
1705 sve_uzp1(dst, H, src, tmp);
1706 sve_uzp1(dst, B, dst, tmp);
1707 }
1708 } else if (src_size == H) {
1709 sve_uzp1(dst, B, src, tmp);
1710 }
1711 }
1712
1713 // Extend src predicate to dst predicate with the same lane count but larger
1714 // element size, e.g. 64Byte -> 512Long
1715 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1716 uint dst_element_length_in_bytes,
1717 uint src_element_length_in_bytes) {
1718 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1719 sve_punpklo(dst, src);
1720 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1721 sve_punpklo(dst, src);
1722 sve_punpklo(dst, dst);
1723 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1724 sve_punpklo(dst, src);
1725 sve_punpklo(dst, dst);
1726 sve_punpklo(dst, dst);
1727 } else {
1728 assert(false, "unsupported");
1729 ShouldNotReachHere();
1730 }
1731 }
1732
1733 // Narrow src predicate to dst predicate with the same lane count but
1734 // smaller element size, e.g. 512Long -> 64Byte
1735 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1736 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1737 // The insignificant bits in src predicate are expected to be zero.
1738 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1739 // passed as the second argument. An example narrowing operation with a given mask would be -
1740 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1741 // Mask (for 2 Longs) : TF
1742 // Predicate register for the above mask (16 bits) : 00000001 00000000
1743 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1744 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1745 assert_different_registers(src, ptmp);
1746 assert_different_registers(dst, ptmp);
1747 sve_pfalse(ptmp);
1748 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1749 sve_uzp1(dst, B, src, ptmp);
1750 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1751 sve_uzp1(dst, H, src, ptmp);
1752 sve_uzp1(dst, B, dst, ptmp);
1753 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1754 sve_uzp1(dst, S, src, ptmp);
1755 sve_uzp1(dst, H, dst, ptmp);
1756 sve_uzp1(dst, B, dst, ptmp);
1757 } else {
1758 assert(false, "unsupported");
1759 ShouldNotReachHere();
1760 }
1761 }
1762
1763 // Vector reduction add for integral type with ASIMD instructions.
1764 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1765 Register isrc, FloatRegister vsrc,
1766 unsigned vector_length_in_bytes,
1767 FloatRegister vtmp) {
1768 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1769 assert_different_registers(dst, isrc);
1770 bool isQ = vector_length_in_bytes == 16;
1771
1772 BLOCK_COMMENT("neon_reduce_add_integral {");
1773 switch(bt) {
1774 case T_BYTE:
1775 addv(vtmp, isQ ? T16B : T8B, vsrc);
1776 smov(dst, vtmp, B, 0);
1777 addw(dst, dst, isrc, ext::sxtb);
1778 break;
1779 case T_SHORT:
1780 addv(vtmp, isQ ? T8H : T4H, vsrc);
1781 smov(dst, vtmp, H, 0);
1782 addw(dst, dst, isrc, ext::sxth);
1783 break;
1784 case T_INT:
1785 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1786 umov(dst, vtmp, S, 0);
1787 addw(dst, dst, isrc);
1788 break;
1789 case T_LONG:
1790 assert(isQ, "unsupported");
1791 addpd(vtmp, vsrc);
1792 umov(dst, vtmp, D, 0);
1793 add(dst, dst, isrc);
1794 break;
1795 default:
1796 assert(false, "unsupported");
1797 ShouldNotReachHere();
1798 }
1799 BLOCK_COMMENT("} neon_reduce_add_integral");
1800 }
1801
1802 // Vector reduction multiply for integral type with ASIMD instructions.
1803 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1804 // Clobbers: rscratch1
1805 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1806 Register isrc, FloatRegister vsrc,
1807 unsigned vector_length_in_bytes,
1808 FloatRegister vtmp1, FloatRegister vtmp2) {
1809 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1810 bool isQ = vector_length_in_bytes == 16;
1811
1812 BLOCK_COMMENT("neon_reduce_mul_integral {");
1813 switch(bt) {
1814 case T_BYTE:
1815 if (isQ) {
1816 // Multiply the lower half and higher half of vector iteratively.
1817 // vtmp1 = vsrc[8:15]
1818 ins(vtmp1, D, vsrc, 0, 1);
1819 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1820 mulv(vtmp1, T8B, vtmp1, vsrc);
1821 // vtmp2 = vtmp1[4:7]
1822 ins(vtmp2, S, vtmp1, 0, 1);
1823 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1824 mulv(vtmp1, T8B, vtmp2, vtmp1);
1825 } else {
1826 ins(vtmp1, S, vsrc, 0, 1);
1827 mulv(vtmp1, T8B, vtmp1, vsrc);
1828 }
1829 // vtmp2 = vtmp1[2:3]
1830 ins(vtmp2, H, vtmp1, 0, 1);
1831 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1832 mulv(vtmp2, T8B, vtmp2, vtmp1);
1833 // dst = vtmp2[0] * isrc * vtmp2[1]
1834 umov(rscratch1, vtmp2, B, 0);
1835 mulw(dst, rscratch1, isrc);
1836 sxtb(dst, dst);
1837 umov(rscratch1, vtmp2, B, 1);
1838 mulw(dst, rscratch1, dst);
1839 sxtb(dst, dst);
1840 break;
1841 case T_SHORT:
1842 if (isQ) {
1843 ins(vtmp2, D, vsrc, 0, 1);
1844 mulv(vtmp2, T4H, vtmp2, vsrc);
1845 ins(vtmp1, S, vtmp2, 0, 1);
1846 mulv(vtmp1, T4H, vtmp1, vtmp2);
1847 } else {
1848 ins(vtmp1, S, vsrc, 0, 1);
1849 mulv(vtmp1, T4H, vtmp1, vsrc);
1850 }
1851 umov(rscratch1, vtmp1, H, 0);
1852 mulw(dst, rscratch1, isrc);
1853 sxth(dst, dst);
1854 umov(rscratch1, vtmp1, H, 1);
1855 mulw(dst, rscratch1, dst);
1856 sxth(dst, dst);
1857 break;
1858 case T_INT:
1859 if (isQ) {
1860 ins(vtmp1, D, vsrc, 0, 1);
1861 mulv(vtmp1, T2S, vtmp1, vsrc);
1862 } else {
1863 vtmp1 = vsrc;
1864 }
1865 umov(rscratch1, vtmp1, S, 0);
1866 mul(dst, rscratch1, isrc);
1867 umov(rscratch1, vtmp1, S, 1);
1868 mul(dst, rscratch1, dst);
1869 break;
1870 case T_LONG:
1871 umov(rscratch1, vsrc, D, 0);
1872 mul(dst, isrc, rscratch1);
1873 umov(rscratch1, vsrc, D, 1);
1874 mul(dst, dst, rscratch1);
1875 break;
1876 default:
1877 assert(false, "unsupported");
1878 ShouldNotReachHere();
1879 }
1880 BLOCK_COMMENT("} neon_reduce_mul_integral");
1881 }
1882
1883 // Vector reduction multiply for floating-point type with ASIMD instructions.
1884 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1885 FloatRegister fsrc, FloatRegister vsrc,
1886 unsigned vector_length_in_bytes,
1887 FloatRegister vtmp) {
1888 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1889 bool isQ = vector_length_in_bytes == 16;
1890
1891 BLOCK_COMMENT("neon_reduce_mul_fp {");
1892 switch(bt) {
1893 // The T_SHORT type below is for Float16 type which also uses floating-point
1894 // instructions.
1895 case T_SHORT:
1896 fmulh(dst, fsrc, vsrc);
1897 ext(vtmp, T8B, vsrc, vsrc, 2);
1898 fmulh(dst, dst, vtmp);
1899 ext(vtmp, T8B, vsrc, vsrc, 4);
1900 fmulh(dst, dst, vtmp);
1901 ext(vtmp, T8B, vsrc, vsrc, 6);
1902 fmulh(dst, dst, vtmp);
1903 if (isQ) {
1904 ext(vtmp, T16B, vsrc, vsrc, 8);
1905 fmulh(dst, dst, vtmp);
1906 ext(vtmp, T16B, vsrc, vsrc, 10);
1907 fmulh(dst, dst, vtmp);
1908 ext(vtmp, T16B, vsrc, vsrc, 12);
1909 fmulh(dst, dst, vtmp);
1910 ext(vtmp, T16B, vsrc, vsrc, 14);
1911 fmulh(dst, dst, vtmp);
1912 }
1913 break;
1914 case T_FLOAT:
1915 fmuls(dst, fsrc, vsrc);
1916 ins(vtmp, S, vsrc, 0, 1);
1917 fmuls(dst, dst, vtmp);
1918 if (isQ) {
1919 ins(vtmp, S, vsrc, 0, 2);
1920 fmuls(dst, dst, vtmp);
1921 ins(vtmp, S, vsrc, 0, 3);
1922 fmuls(dst, dst, vtmp);
1923 }
1924 break;
1925 case T_DOUBLE:
1926 assert(isQ, "unsupported");
1927 fmuld(dst, fsrc, vsrc);
1928 ins(vtmp, D, vsrc, 0, 1);
1929 fmuld(dst, dst, vtmp);
1930 break;
1931 default:
1932 assert(false, "unsupported");
1933 ShouldNotReachHere();
1934 }
1935 BLOCK_COMMENT("} neon_reduce_mul_fp");
1936 }
1937
1938 // Vector reduction add for half float type with ASIMD instructions.
1939 void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
1940 unsigned vector_length_in_bytes, FloatRegister vtmp) {
1941 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1942 bool isQ = vector_length_in_bytes == 16;
1943
1944 BLOCK_COMMENT("neon_reduce_add_fp16 {");
1945 faddh(dst, fsrc, vsrc);
1946 ext(vtmp, T8B, vsrc, vsrc, 2);
1947 faddh(dst, dst, vtmp);
1948 ext(vtmp, T8B, vsrc, vsrc, 4);
1949 faddh(dst, dst, vtmp);
1950 ext(vtmp, T8B, vsrc, vsrc, 6);
1951 faddh(dst, dst, vtmp);
1952 if (isQ) {
1953 ext(vtmp, T16B, vsrc, vsrc, 8);
1954 faddh(dst, dst, vtmp);
1955 ext(vtmp, T16B, vsrc, vsrc, 10);
1956 faddh(dst, dst, vtmp);
1957 ext(vtmp, T16B, vsrc, vsrc, 12);
1958 faddh(dst, dst, vtmp);
1959 ext(vtmp, T16B, vsrc, vsrc, 14);
1960 faddh(dst, dst, vtmp);
1961 }
1962 BLOCK_COMMENT("} neon_reduce_add_fp16");
1963 }
1964
1965 // Helper to select logical instruction
1966 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1967 Register Rn, Register Rm,
1968 enum shift_kind kind, unsigned shift) {
1969 switch(opc) {
1970 case Op_AndReductionV:
1971 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1972 break;
1973 case Op_OrReductionV:
1974 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1975 break;
1976 case Op_XorReductionV:
1977 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1978 break;
1979 default:
1980 assert(false, "unsupported");
1981 ShouldNotReachHere();
1982 }
1983 }
1984
1985 // Vector reduction logical operations And, Or, Xor
1986 // Clobbers: rscratch1
1987 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1988 Register isrc, FloatRegister vsrc,
1989 unsigned vector_length_in_bytes) {
1990 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1991 "unsupported");
1992 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1993 assert_different_registers(dst, isrc);
1994 bool isQ = vector_length_in_bytes == 16;
1995
1996 BLOCK_COMMENT("neon_reduce_logical {");
1997 umov(rscratch1, vsrc, isQ ? D : S, 0);
1998 umov(dst, vsrc, isQ ? D : S, 1);
1999 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2000 switch(bt) {
2001 case T_BYTE:
2002 if (isQ) {
2003 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2004 }
2005 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2006 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2007 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2008 sxtb(dst, dst);
2009 break;
2010 case T_SHORT:
2011 if (isQ) {
2012 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2013 }
2014 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2015 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2016 sxth(dst, dst);
2017 break;
2018 case T_INT:
2019 if (isQ) {
2020 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2021 }
2022 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2023 break;
2024 case T_LONG:
2025 assert(isQ, "unsupported");
2026 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2027 break;
2028 default:
2029 assert(false, "unsupported");
2030 ShouldNotReachHere();
2031 }
2032 BLOCK_COMMENT("} neon_reduce_logical");
2033 }
2034
2035 // Helper function to decode min/max reduction operation properties
2036 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2037 bool* is_unsigned,
2038 Condition* cond) {
2039 switch(opc) {
2040 case Op_MinReductionV:
2041 *is_min = true; *is_unsigned = false; *cond = LT; break;
2042 case Op_MaxReductionV:
2043 *is_min = false; *is_unsigned = false; *cond = GT; break;
2044 case Op_UMinReductionV:
2045 *is_min = true; *is_unsigned = true; *cond = LO; break;
2046 case Op_UMaxReductionV:
2047 *is_min = false; *is_unsigned = true; *cond = HI; break;
2048 default:
2049 ShouldNotReachHere();
2050 }
2051 }
2052
2053 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
2054 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2055 // Clobbers: rscratch1, rflags
2056 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2057 Register isrc, FloatRegister vsrc,
2058 unsigned vector_length_in_bytes,
2059 FloatRegister vtmp) {
2060 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
2061 opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
2062 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2063 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2064 assert_different_registers(dst, isrc);
2065 bool isQ = vector_length_in_bytes == 16;
2066 bool is_min;
2067 bool is_unsigned;
2068 Condition cond;
2069 decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2070 BLOCK_COMMENT("neon_reduce_minmax_integral {");
2071 if (bt == T_LONG) {
2072 assert(vtmp == fnoreg, "should be");
2073 assert(isQ, "should be");
2074 umov(rscratch1, vsrc, D, 0);
2075 cmp(isrc, rscratch1);
2076 csel(dst, isrc, rscratch1, cond);
2077 umov(rscratch1, vsrc, D, 1);
2078 cmp(dst, rscratch1);
2079 csel(dst, dst, rscratch1, cond);
2080 } else {
2081 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2082 if (size == T2S) {
2083 // For T2S (2x32-bit elements), use pairwise instructions because
2084 // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
2085 neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
2086 } else {
2087 // For other sizes, use reduction to scalar instructions.
2088 neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
2089 }
2090 if (bt == T_INT) {
2091 umov(dst, vtmp, S, 0);
2092 } else if (is_unsigned) {
2093 umov(dst, vtmp, elemType_to_regVariant(bt), 0);
2094 } else {
2095 smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2096 }
2097 cmpw(dst, isrc);
2098 cselw(dst, dst, isrc, cond);
2099 }
2100 BLOCK_COMMENT("} neon_reduce_minmax_integral");
2101 }
2102
2103 // Vector reduction for integral type with SVE instruction.
2104 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
2105 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2106 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2107 FloatRegister src2, PRegister pg, FloatRegister tmp) {
2108 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2109 assert(pg->is_governing(), "This register has to be a governing predicate register");
2110 assert_different_registers(src1, dst);
2111 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2112 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2113 switch (opc) {
2114 case Op_AddReductionVI: {
2115 sve_uaddv(tmp, size, pg, src2);
2116 if (bt == T_BYTE) {
2117 smov(dst, tmp, size, 0);
2118 addw(dst, src1, dst, ext::sxtb);
2119 } else if (bt == T_SHORT) {
2120 smov(dst, tmp, size, 0);
2121 addw(dst, src1, dst, ext::sxth);
2122 } else {
2123 umov(dst, tmp, size, 0);
2124 addw(dst, dst, src1);
2125 }
2126 break;
2127 }
2128 case Op_AddReductionVL: {
2129 sve_uaddv(tmp, size, pg, src2);
2130 umov(dst, tmp, size, 0);
2131 add(dst, dst, src1);
2132 break;
2133 }
2134 case Op_AndReductionV: {
2135 sve_andv(tmp, size, pg, src2);
2136 if (bt == T_INT || bt == T_LONG) {
2137 umov(dst, tmp, size, 0);
2138 } else {
2139 smov(dst, tmp, size, 0);
2140 }
2141 if (bt == T_LONG) {
2142 andr(dst, dst, src1);
2143 } else {
2144 andw(dst, dst, src1);
2145 }
2146 break;
2147 }
2148 case Op_OrReductionV: {
2149 sve_orv(tmp, size, pg, src2);
2150 if (bt == T_INT || bt == T_LONG) {
2151 umov(dst, tmp, size, 0);
2152 } else {
2153 smov(dst, tmp, size, 0);
2154 }
2155 if (bt == T_LONG) {
2156 orr(dst, dst, src1);
2157 } else {
2158 orrw(dst, dst, src1);
2159 }
2160 break;
2161 }
2162 case Op_XorReductionV: {
2163 sve_eorv(tmp, size, pg, src2);
2164 if (bt == T_INT || bt == T_LONG) {
2165 umov(dst, tmp, size, 0);
2166 } else {
2167 smov(dst, tmp, size, 0);
2168 }
2169 if (bt == T_LONG) {
2170 eor(dst, dst, src1);
2171 } else {
2172 eorw(dst, dst, src1);
2173 }
2174 break;
2175 }
2176 case Op_MaxReductionV:
2177 case Op_MinReductionV:
2178 case Op_UMaxReductionV:
2179 case Op_UMinReductionV: {
2180 bool is_min;
2181 bool is_unsigned;
2182 Condition cond;
2183 decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2184 sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
2185 // Move result from vector to general register
2186 if (is_unsigned || bt == T_INT || bt == T_LONG) {
2187 umov(dst, tmp, size, 0);
2188 } else {
2189 smov(dst, tmp, size, 0);
2190 }
2191 if (bt == T_LONG) {
2192 cmp(dst, src1);
2193 csel(dst, dst, src1, cond);
2194 } else {
2195 cmpw(dst, src1);
2196 cselw(dst, dst, src1, cond);
2197 }
2198 break;
2199 }
2200 default:
2201 assert(false, "unsupported");
2202 ShouldNotReachHere();
2203 }
2204
2205 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2206 if (bt == T_BYTE) {
2207 sxtb(dst, dst);
2208 } else if (bt == T_SHORT) {
2209 sxth(dst, dst);
2210 }
2211 }
2212 }
2213
2214 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2215 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2216 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2217 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2218 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2219 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2220
2221 // Set all elements to false if the input "lane_cnt" is zero.
2222 if (lane_cnt == 0) {
2223 sve_pfalse(dst);
2224 return;
2225 }
2226
2227 SIMD_RegVariant size = elemType_to_regVariant(bt);
2228 assert(size != Q, "invalid size");
2229
2230 // Set all true if "lane_cnt" equals to the max lane count.
2231 if (lane_cnt == max_vector_length) {
2232 sve_ptrue(dst, size, /* ALL */ 0b11111);
2233 return;
2234 }
2235
2236 // Fixed numbers for "ptrue".
2237 switch(lane_cnt) {
2238 case 1: /* VL1 */
2239 case 2: /* VL2 */
2240 case 3: /* VL3 */
2241 case 4: /* VL4 */
2242 case 5: /* VL5 */
2243 case 6: /* VL6 */
2244 case 7: /* VL7 */
2245 case 8: /* VL8 */
2246 sve_ptrue(dst, size, lane_cnt);
2247 return;
2248 case 16:
2249 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2250 return;
2251 case 32:
2252 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2253 return;
2254 case 64:
2255 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2256 return;
2257 case 128:
2258 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2259 return;
2260 case 256:
2261 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2262 return;
2263 default:
2264 break;
2265 }
2266
2267 // Special patterns for "ptrue".
2268 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2269 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2270 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2271 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2272 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2273 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2274 } else {
2275 // Encode to "whileltw" for the remaining cases.
2276 mov(rscratch1, lane_cnt);
2277 sve_whileltw(dst, size, zr, rscratch1);
2278 }
2279 }
2280
2281 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2282 // Any remaining elements of dst will be filled with zero.
2283 // Clobbers: rscratch1
2284 // Preserves: mask, vzr
2285 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2286 FloatRegister vzr, FloatRegister vtmp,
2287 PRegister pgtmp, unsigned vector_length_in_bytes) {
2288 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2289 // When called by sve_compress_byte, src and vtmp may be the same register.
2290 assert_different_registers(dst, src, vzr);
2291 assert_different_registers(dst, vtmp, vzr);
2292 assert_different_registers(mask, pgtmp);
2293 // high <-- low
2294 // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
2295 // mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
2296 // Expected result: dst = 00 00 00 hh ee dd bb aa
2297
2298 // Extend lowest half to type INT.
2299 // dst = 00dd 00cc 00bb 00aa
2300 sve_uunpklo(dst, S, src);
2301 // pgtmp = 0001 0000 0001 0001
2302 sve_punpklo(pgtmp, mask);
2303 // Pack the active elements in size of type INT to the right,
2304 // and fill the remainings with zero.
2305 // dst = 0000 00dd 00bb 00aa
2306 sve_compact(dst, S, dst, pgtmp);
2307 // Narrow the result back to type SHORT.
2308 // dst = 00 00 00 00 00 dd bb aa
2309 sve_uzp1(dst, H, dst, vzr);
2310
2311 // Return if the vector length is no more than MaxVectorSize/2, since the
2312 // highest half is invalid.
2313 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2314 return;
2315 }
2316
2317 // Count the active elements of lowest half.
2318 // rscratch1 = 3
2319 sve_cntp(rscratch1, S, ptrue, pgtmp);
2320
2321 // Repeat to the highest half.
2322 // pgtmp = 0001 0000 0000 0001
2323 sve_punpkhi(pgtmp, mask);
2324 // vtmp = 00hh 00gg 00ff 00ee
2325 sve_uunpkhi(vtmp, S, src);
2326 // vtmp = 0000 0000 00hh 00ee
2327 sve_compact(vtmp, S, vtmp, pgtmp);
2328 // vtmp = 00 00 00 00 00 00 hh ee
2329 sve_uzp1(vtmp, H, vtmp, vzr);
2330
2331 // pgtmp = 00 00 00 00 00 01 01 01
2332 sve_whilelt(pgtmp, H, zr, rscratch1);
2333 // Compressed low: dst = 00 00 00 00 00 dd bb aa
2334 // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2335 // Combine the compressed low with the compressed high:
2336 // dst = 00 00 00 hh ee dd bb aa
2337 sve_splice(dst, H, pgtmp, vtmp);
2338 }
2339
2340 // Clobbers: rscratch1, rscratch2
2341 // Preserves: src, mask
2342 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2343 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2344 PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2345 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2346 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2347 assert_different_registers(mask, ptmp, pgtmp);
2348 // high <-- low
2349 // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
2350 // mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2351 // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2352 FloatRegister vzr = vtmp3;
2353 sve_dup(vzr, B, 0);
2354
2355 // Extend lowest half to type SHORT.
2356 // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
2357 sve_uunpklo(vtmp1, H, src);
2358 // ptmp = 00 01 00 00 00 01 00 01
2359 sve_punpklo(ptmp, mask);
2360 // Pack the active elements in size of type SHORT to the right,
2361 // and fill the remainings with zero.
2362 // dst = 00 00 00 00 00 0g 0c 0a
2363 unsigned extended_size = vector_length_in_bytes << 1;
2364 sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2365 // Narrow the result back to type BYTE.
2366 // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2367 sve_uzp1(dst, B, dst, vzr);
2368
2369 // Return if the vector length is no more than MaxVectorSize/2, since the
2370 // highest half is invalid.
2371 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2372 return;
2373 }
2374 // Count the active elements of lowest half.
2375 // rscratch2 = 3
2376 sve_cntp(rscratch2, H, ptrue, ptmp);
2377
2378 // Repeat to the highest half.
2379 // ptmp = 00 01 00 00 00 00 00 01
2380 sve_punpkhi(ptmp, mask);
2381 // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
2382 sve_uunpkhi(vtmp2, H, src);
2383 // vtmp1 = 00 00 00 00 00 00 0p 0i
2384 sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2385 // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2386 sve_uzp1(vtmp1, B, vtmp1, vzr);
2387
2388 // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2389 sve_whilelt(ptmp, B, zr, rscratch2);
2390 // Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2391 // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2392 // Combine the compressed low with the compressed high:
2393 // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2394 sve_splice(dst, B, ptmp, vtmp1);
2395 }
2396
2397 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2398 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2399 SIMD_Arrangement size = isQ ? T16B : T8B;
2400 if (bt == T_BYTE) {
2401 rbit(dst, size, src);
2402 } else {
2403 neon_reverse_bytes(dst, src, bt, isQ);
2404 rbit(dst, size, dst);
2405 }
2406 }
2407
2408 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2409 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2410 SIMD_Arrangement size = isQ ? T16B : T8B;
2411 switch (bt) {
2412 case T_BYTE:
2413 if (dst != src) {
2414 orr(dst, size, src, src);
2415 }
2416 break;
2417 case T_SHORT:
2418 rev16(dst, size, src);
2419 break;
2420 case T_INT:
2421 rev32(dst, size, src);
2422 break;
2423 case T_LONG:
2424 rev64(dst, size, src);
2425 break;
2426 default:
2427 assert(false, "unsupported");
2428 ShouldNotReachHere();
2429 }
2430 }
2431
2432 // VectorRearrange implementation for short/int/float/long/double types with NEON
2433 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2434 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2435 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2436 // and use bsl to implement the operation.
2437 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2438 FloatRegister shuffle, FloatRegister tmp,
2439 BasicType bt, bool isQ) {
2440 assert_different_registers(dst, src, shuffle, tmp);
2441 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2442 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2443
2444 // Here is an example that rearranges a NEON vector with 4 ints:
2445 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2446 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2447 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2448 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2449 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2450 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2451 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2452 // 4. Use Vm as index register, and use V1 as table register.
2453 // Then get V2 as the result by tbl NEON instructions.
2454 switch (bt) {
2455 case T_SHORT:
2456 mov(tmp, size1, 0x02);
2457 mulv(dst, size2, shuffle, tmp);
2458 mov(tmp, size2, 0x0100);
2459 addv(dst, size1, dst, tmp);
2460 tbl(dst, size1, src, 1, dst);
2461 break;
2462 case T_INT:
2463 case T_FLOAT:
2464 mov(tmp, size1, 0x04);
2465 mulv(dst, size2, shuffle, tmp);
2466 mov(tmp, size2, 0x03020100);
2467 addv(dst, size1, dst, tmp);
2468 tbl(dst, size1, src, 1, dst);
2469 break;
2470 case T_LONG:
2471 case T_DOUBLE:
2472 {
2473 int idx = vector_iota_entry_index(T_LONG);
2474 lea(rscratch1,
2475 ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
2476 ldrq(tmp, rscratch1);
2477 // Check whether the input "shuffle" is the same with iota indices.
2478 // Return "src" if true, otherwise swap the two elements of "src".
2479 cm(EQ, dst, size2, shuffle, tmp);
2480 ext(tmp, size1, src, src, 8);
2481 bsl(dst, size1, src, tmp);
2482 }
2483 break;
2484 default:
2485 assert(false, "unsupported element type");
2486 ShouldNotReachHere();
2487 }
2488 }
2489
2490 // Extract a scalar element from an sve vector at position 'idx'.
2491 // The input elements in src are expected to be of integral type.
2492 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2493 int idx, FloatRegister vtmp) {
2494 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2495 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2496 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2497 if (bt == T_INT || bt == T_LONG) {
2498 umov(dst, src, size, idx);
2499 } else {
2500 smov(dst, src, size, idx);
2501 }
2502 } else {
2503 sve_orr(vtmp, src, src);
2504 sve_ext(vtmp, vtmp, idx << size);
2505 if (bt == T_INT || bt == T_LONG) {
2506 umov(dst, vtmp, size, 0);
2507 } else {
2508 smov(dst, vtmp, size, 0);
2509 }
2510 }
2511 }
2512
2513 // java.lang.Math::round intrinsics
2514
2515 // Clobbers: rscratch1, rflags
2516 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2517 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2518 assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2519 switch (T) {
2520 case T2S:
2521 case T4S:
2522 fmovs(tmp1, T, 0.5f);
2523 mov(rscratch1, jint_cast(0x1.0p23f));
2524 break;
2525 case T2D:
2526 fmovd(tmp1, T, 0.5);
2527 mov(rscratch1, julong_cast(0x1.0p52));
2528 break;
2529 default:
2530 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2531 }
2532 fadd(tmp1, T, tmp1, src);
2533 fcvtms(tmp1, T, tmp1);
2534 // tmp1 = floor(src + 0.5, ties to even)
2535
2536 fcvtas(dst, T, src);
2537 // dst = round(src), ties to away
2538
2539 fneg(tmp3, T, src);
2540 dup(tmp2, T, rscratch1);
2541 cm(HS, tmp3, T, tmp3, tmp2);
2542 // tmp3 is now a set of flags
2543
2544 bif(dst, T16B, tmp1, tmp3);
2545 // result in dst
2546 }
2547
2548 // Clobbers: rscratch1, rflags
2549 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2550 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2551 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2552 assert_different_registers(tmp1, tmp2, src, dst);
2553
2554 switch (T) {
2555 case S:
2556 mov(rscratch1, jint_cast(0x1.0p23f));
2557 break;
2558 case D:
2559 mov(rscratch1, julong_cast(0x1.0p52));
2560 break;
2561 default:
2562 assert(T == S || T == D, "invalid register variant");
2563 }
2564
2565 sve_frinta(dst, T, ptrue, src);
2566 // dst = round(src), ties to away
2567
2568 Label none;
2569
2570 sve_fneg(tmp1, T, ptrue, src);
2571 sve_dup(tmp2, T, rscratch1);
2572 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2573 br(EQ, none);
2574 {
2575 sve_cpy(tmp1, T, pgtmp, 0.5);
2576 sve_fadd(tmp1, T, pgtmp, src);
2577 sve_frintm(dst, T, pgtmp, tmp1);
2578 // dst = floor(src + 0.5, ties to even)
2579 }
2580 bind(none);
2581
2582 sve_fcvtzs(dst, T, ptrue, dst, T);
2583 // result in dst
2584 }
2585
2586 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2587 FloatRegister one, SIMD_Arrangement T) {
2588 assert_different_registers(dst, src, zero, one);
2589 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2590
2591 facgt(dst, T, src, zero);
2592 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2593 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2594 }
2595
2596 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2597 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2598 assert_different_registers(dst, src, zero, one, vtmp);
2599 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2600
2601 sve_orr(vtmp, src, src);
2602 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2603 switch (T) {
2604 case S:
2605 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2606 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2607 // on the sign of the float value
2608 break;
2609 case D:
2610 sve_and(vtmp, T, min_jlong);
2611 sve_orr(vtmp, T, jlong_cast(1.0));
2612 break;
2613 default:
2614 assert(false, "unsupported");
2615 ShouldNotReachHere();
2616 }
2617 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2618 // Result in dst
2619 }
2620
2621 bool C2_MacroAssembler::in_scratch_emit_size() {
2622 if (ciEnv::current()->task() != nullptr) {
2623 PhaseOutput* phase_output = Compile::current()->output();
2624 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2625 return true;
2626 }
2627 }
2628 return MacroAssembler::in_scratch_emit_size();
2629 }
2630
2631 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2632 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2633 }
2634
2635 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2636 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2637 if (t == TypeInt::INT) {
2638 return;
2639 }
2640
2641 BLOCK_COMMENT("verify_int_in_range {");
2642 Label L_success, L_failure;
2643
2644 jint lo = t->_lo;
2645 jint hi = t->_hi;
2646
2647 if (lo != min_jint) {
2648 subsw(rtmp, rval, lo);
2649 br(Assembler::LT, L_failure);
2650 }
2651 if (hi != max_jint) {
2652 subsw(rtmp, rval, hi);
2653 br(Assembler::GT, L_failure);
2654 }
2655 b(L_success);
2656
2657 bind(L_failure);
2658 movw(c_rarg0, idx);
2659 mov(c_rarg1, rval);
2660 movw(c_rarg2, lo);
2661 movw(c_rarg3, hi);
2662 reconstruct_frame_pointer(rtmp);
2663 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2664 hlt(0);
2665
2666 bind(L_success);
2667 BLOCK_COMMENT("} verify_int_in_range");
2668 }
2669
2670 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2671 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2672 }
2673
2674 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2675 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2676 if (t == TypeLong::LONG) {
2677 return;
2678 }
2679
2680 BLOCK_COMMENT("verify_long_in_range {");
2681 Label L_success, L_failure;
2682
2683 jlong lo = t->_lo;
2684 jlong hi = t->_hi;
2685
2686 if (lo != min_jlong) {
2687 subs(rtmp, rval, lo);
2688 br(Assembler::LT, L_failure);
2689 }
2690 if (hi != max_jlong) {
2691 subs(rtmp, rval, hi);
2692 br(Assembler::GT, L_failure);
2693 }
2694 b(L_success);
2695
2696 bind(L_failure);
2697 movw(c_rarg0, idx);
2698 mov(c_rarg1, rval);
2699 mov(c_rarg2, lo);
2700 mov(c_rarg3, hi);
2701 reconstruct_frame_pointer(rtmp);
2702 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2703 hlt(0);
2704
2705 bind(L_success);
2706 BLOCK_COMMENT("} verify_long_in_range");
2707 }
2708
2709 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2710 const int framesize = Compile::current()->output()->frame_size_in_bytes();
2711 if (PreserveFramePointer) {
2712 // frame pointer is valid
2713 #ifdef ASSERT
2714 // Verify frame pointer value in rfp.
2715 add(rtmp, sp, framesize - 2 * wordSize);
2716 Label L_success;
2717 cmp(rfp, rtmp);
2718 br(Assembler::EQ, L_success);
2719 stop("frame pointer mismatch");
2720 bind(L_success);
2721 #endif // ASSERT
2722 } else {
2723 add(rfp, sp, framesize - 2 * wordSize);
2724 }
2725 }
2726
2727 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2728 // using Neon instructions and places it in the destination vector element corresponding to the
2729 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2730 // where NUM_ELEM is the number of BasicType elements per vector.
2731 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2732 // Otherwise, selects src2[idx – NUM_ELEM]
2733 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2734 FloatRegister src2, FloatRegister index,
2735 FloatRegister tmp, unsigned vector_length_in_bytes) {
2736 assert_different_registers(dst, src1, src2, tmp);
2737 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2738
2739 if (vector_length_in_bytes == 16) {
2740 assert(UseSVE <= 1, "sve must be <= 1");
2741 assert(src1->successor() == src2, "Source registers must be ordered");
2742 // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2743 tbl(dst, size, src1, 2, index);
2744 } else { // vector length == 8
2745 assert(UseSVE == 0, "must be Neon only");
2746 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2747 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2748 // instruction with one vector lookup
2749 ins(tmp, D, src1, 0, 0);
2750 ins(tmp, D, src2, 1, 0);
2751 tbl(dst, size, tmp, 1, index);
2752 }
2753 }
2754
2755 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2756 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2757 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2758 // where NUM_ELEM is the number of BasicType elements per vector.
2759 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2760 // Otherwise, selects src2[idx – NUM_ELEM]
2761 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2762 FloatRegister src2, FloatRegister index,
2763 FloatRegister tmp, SIMD_RegVariant T,
2764 unsigned vector_length_in_bytes) {
2765 assert_different_registers(dst, src1, src2, index, tmp);
2766
2767 if (vector_length_in_bytes == 8) {
2768 // We need to fit both the source vectors (src1, src2) in a single vector register because the
2769 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2770 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2771 // instruction with one vector lookup
2772 assert(UseSVE >= 1, "sve must be >= 1");
2773 ins(tmp, D, src1, 0, 0);
2774 ins(tmp, D, src2, 1, 0);
2775 sve_tbl(dst, T, tmp, index);
2776 } else { // UseSVE == 2 and vector_length_in_bytes > 8
2777 // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2778 // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2779 // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2780 // with the only exception of 8B vector length.
2781 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2782 assert(src1->successor() == src2, "Source registers must be ordered");
2783 sve_tbl(dst, T, src1, src2, index);
2784 }
2785 }
2786
2787 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2788 FloatRegister src2, FloatRegister index,
2789 FloatRegister tmp, BasicType bt,
2790 unsigned vector_length_in_bytes) {
2791
2792 assert_different_registers(dst, src1, src2, index, tmp);
2793
2794 // The cases that can reach this method are -
2795 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2796 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2797 //
2798 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2799 // and UseSVE = 2 with vector_length_in_bytes >= 8
2800 //
2801 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2802 // UseSVE = 1 with vector_length_in_bytes = 16
2803
2804 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2805 SIMD_RegVariant T = elemType_to_regVariant(bt);
2806 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2807 return;
2808 }
2809
2810 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2811 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2812 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2813
2814 bool isQ = vector_length_in_bytes == 16;
2815
2816 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2817 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2818
2819 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2820 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2821 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2822 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2823 // the indices can range from [0, 8).
2824 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2825 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2826 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2827 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2828 // Add the multiplied result to the vector in tmp to obtain the byte level
2829 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2830 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2831
2832 if (bt == T_BYTE) {
2833 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2834 } else {
2835 int elem_size = (bt == T_SHORT) ? 2 : 4;
2836 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2837
2838 mov(tmp, size1, elem_size);
2839 mulv(dst, size2, index, tmp);
2840 mov(tmp, size2, tbl_offset);
2841 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2842 // to select a set of 2B/4B
2843 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2844 }
2845 }
2846
2847 // Vector expand implementation. Elements from the src vector are expanded into
2848 // the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
2852 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2853 // for NEON and SVE, but with different instructions where appropriate.
2854
2855 // Vector expand implementation for NEON.
2856 //
2857 // An example of 128-bit Byte vector:
2858 // Data direction: high <== low
2859 // Input:
2860 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2861 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2862 // Expected result:
2863 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2864 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2865 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2866 int vector_length_in_bytes) {
2867 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2868 assert_different_registers(dst, src, mask, tmp1, tmp2);
2869 // Since the TBL instruction only supports byte table, we need to
2870 // compute indices in byte type for all types.
2871 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2872 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2873 dup(tmp1, size, zr);
2874 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
2875 negr(dst, size, mask);
2876 // Calculate vector index for TBL with prefix sum algorithm.
2877 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
2878 for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2879 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2880 addv(dst, size, tmp2, dst);
2881 }
2882 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2883 orr(tmp2, size, mask, mask);
2884 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2885 bsl(tmp2, size, dst, tmp1);
2886 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2887 movi(tmp1, size, 1);
2888 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
2889 subv(dst, size, tmp2, tmp1);
2890 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2891 tbl(dst, size, src, 1, dst);
2892 }
2893
2894 // Vector expand implementation for SVE.
2895 //
2896 // An example of 128-bit Short vector:
2897 // Data direction: high <== low
2898 // Input:
2899 // src = gf ed cb a9 87 65 43 21
2900 // pg = 00 01 00 01 00 01 00 01
2901 // Expected result:
2902 // dst = 00 87 00 65 00 43 00 21
2903 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2904 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2905 int vector_length_in_bytes) {
2906 assert(UseSVE > 0, "expand implementation only for SVE");
2907 assert_different_registers(dst, src, tmp1, tmp2);
2908 SIMD_RegVariant size = elemType_to_regVariant(bt);
2909
2910 // tmp1 = 00 00 00 00 00 00 00 00
2911 sve_dup(tmp1, size, 0);
2912 sve_movprfx(tmp2, tmp1);
2913 // tmp2 = 00 01 00 01 00 01 00 01
2914 sve_cpy(tmp2, size, pg, 1, true);
2915 // Calculate vector index for TBL with prefix sum algorithm.
2916 // tmp2 = 04 04 03 03 02 02 01 01
2917 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2918 sve_movprfx(dst, tmp1);
2919 // The EXT instruction operates on the full-width sve register. The correct
2920 // index calculation method is:
2921 // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2922 // MaxVectorSize - i.
2923 sve_ext(dst, tmp2, MaxVectorSize - i);
2924 sve_add(tmp2, size, dst, tmp2);
2925 }
2926 // dst = 00 04 00 03 00 02 00 01
2927 sve_sel(dst, size, pg, tmp2, tmp1);
2928 // dst = -1 03 -1 02 -1 01 -1 00
2929 sve_sub(dst, size, 1);
2930 // dst = 00 87 00 65 00 43 00 21
2931 sve_tbl(dst, size, src, dst);
2932 }
2933
2934 // Optimized SVE cpy (imm, zeroing) instruction.
2935 //
2936 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2937 // functionality, but test results show that `movi; cpy(imm, merging)` has
2938 // higher throughput on some microarchitectures. This would depend on
2939 // microarchitecture and so may vary between implementations.
2940 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2941 PRegister pg, int imm8, bool isMerge) {
2942 if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2943 // Generates a NEON instruction `movi V<dst>.2d, #0`.
2944 // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2945 // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2946 // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2947 // entire Z<dst> register. According to the Arm Software Optimization
2948 // Guide, `movi` is zero latency.
2949 movi(dst, T2D, 0);
2950 isMerge = true;
2951 }
2952 Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2953 }
2954
2955 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
2956 // The vector iota entries array is ordered by type B/S/I/L/F/D, and
2957 // the offset between two types is 16.
2958 switch(bt) {
2959 case T_BYTE:
2960 return 0;
2961 case T_SHORT:
2962 return 1;
2963 case T_INT:
2964 return 2;
2965 case T_LONG:
2966 return 3;
2967 case T_FLOAT:
2968 return 4;
2969 case T_DOUBLE:
2970 return 5;
2971 default:
2972 ShouldNotReachHere();
2973 }
2974 }