1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "opto/c2_MacroAssembler.hpp"
28 #include "opto/compile.hpp"
29 #include "opto/intrinsicnode.hpp"
30 #include "opto/matcher.hpp"
31 #include "opto/output.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/objectMonitorTable.hpp"
34 #include "runtime/stubRoutines.hpp"
35 #include "runtime/synchronizer.hpp"
36 #include "utilities/globalDefinitions.hpp"
37 #include "utilities/powerOfTwo.hpp"
38
39 #ifdef PRODUCT
40 #define BLOCK_COMMENT(str) /* nothing */
41 #define STOP(error) stop(error)
42 #else
43 #define BLOCK_COMMENT(str) block_comment(str)
44 #define STOP(error) block_comment(error); stop(error)
45 #endif
46
47 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
48
49 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
50
// jdk.internal.util.ArraysSupport.vectorizedHashCode
//
// Computes the Java polynomial hash (h = h * 31 + element) over the 'cnt'
// elements of the array at 'ary', accumulating into 'result' (which holds the
// initial hash value on entry). Arrays below the large threshold are handled
// inline by an unrolled scalar loop entered through a computed branch; larger
// arrays are delegated to the large_arrays_hashcode stub generated for
// 'eltype' (which uses the FloatRegister arguments).
//
// Returns pc() on success, or nullptr if the trampoline call to the stub
// could not be emitted (code cache full).
// Clobbers: rscratch1, rscratch2, rflags; 'ary' and 'cnt' are destroyed.
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT                       ? 4
                                                            : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  movw(tmp2, 0x1f); // 31, the hash multiplier used by the maddw below
  br(tmp1);

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    // result = result * 31 + element
    maddw(result, result, tmp2, tmp1);
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    // Code cache is full; report failure to the caller.
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}
151
// Emits the C2 fast path for monitorenter (lightweight locking).
//
//   obj   - object being locked
//   box   - on-stack BasicLock; used here only to clear/fill the cached
//           ObjectMonitor* when UseObjectMonitorTable is enabled
//   t1-t3 - temporaries
//
// Control falls through with the condition flags encoding the outcome:
// EQ => lock acquired (fast path), NE => take the slow path (runtime call).
// The debug asserts at the end verify every branch honors this contract.
// Clobbers: t1, t2, t3, rscratch2, rflags.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Objects of value-based classes are diverted to the slow path so the
    // runtime can diagnose the synchronization attempt.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive. (obj already on top of the lock-stack.)
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      // The monitor is not in the mark word; look it up via the per-thread
      // cache first, then via the global ObjectMonitorTable.
      const Register t1_hash = t1;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mov(t3, t1_mark);

      // Look for the monitor in the om_cache.

      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
        ldr(t2, Address(rthread, cache_offset));
        cmp(obj, t2);
        br(Assembler::EQ, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      if (UseCompactObjectHeaders) {
        // TODO: The fast-path table lookup currently doesn't work with Lilliput's
        // compact identity-hashcode implementation.
        // See: https://bugs.openjdk.org/browse/JDK-8380981
        b(slow_path);
      } else {
        // Look for the monitor in the table.

        // Get the hash code.
        ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);

        // Get the table and calculate the bucket's address
        lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
        ldr(t3, Address(t3));
        ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
        ands(t1_hash, t1_hash, t2); // bucket index = hash & capacity_mask
        ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));

        // Read the monitor from the bucket.
        ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));

        // Check if the monitor in the bucket is special (empty, tombstone or removed).
        cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
        br(Assembler::LO, slow_path);

        // Check if object matches.
        ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
        BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
        bs_asm->try_resolve_weak_handle_in_c2(this, t3, t2, slow_path);
        cmp(t3, obj);
        br(Assembler::NE, slow_path);
      }
      bind(monitor_found);
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // With UseObjectMonitorTable the monitor pointer is untagged; otherwise it
    // still carries the monitor_value tag from the mark word, which the offset
    // subtraction below compensates for.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive. (CAS failed; we may already be the owner.)
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor in the BasicLock for fast_unlock.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
328
329 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
330 Register t2, Register t3) {
331 assert_different_registers(obj, box, t1, t2, t3);
332
333 // Handle inflated monitor.
334 Label inflated, inflated_load_mark;
335 // Finish fast unlock successfully. MUST branch to with flag == EQ
336 Label unlocked;
337 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
338 Label slow_path;
339
340 const Register t1_mark = t1;
341 const Register t2_top = t2;
342 const Register t3_t = t3;
343
344 { // Fast unlock
345
346 Label push_and_slow_path;
347
348 // Check if obj is top of lock-stack.
349 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
350 subw(t2_top, t2_top, oopSize);
351 ldr(t3_t, Address(rthread, t2_top));
352 cmp(obj, t3_t);
353 // Top of lock stack was not obj. Must be monitor.
354 br(Assembler::NE, inflated_load_mark);
355
356 // Pop lock-stack.
357 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
358 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
359
360 // Check if recursive.
361 subw(t3_t, t2_top, oopSize);
362 ldr(t3_t, Address(rthread, t3_t));
363 cmp(obj, t3_t);
364 br(Assembler::EQ, unlocked);
365
366 // Not recursive.
367 // Load Mark.
368 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
369
370 // Check header for monitor (0b10).
371 // Because we got here by popping (meaning we pushed in locked)
372 // there will be no monitor in the box. So we need to push back the obj
373 // so that the runtime can fix any potential anonymous owner.
374 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
375
376 // Try to unlock. Transition lock bits 0b00 => 0b01
377 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
378 orr(t3_t, t1_mark, markWord::unlocked_value);
379 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
380 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
381 br(Assembler::EQ, unlocked);
382
383 bind(push_and_slow_path);
384 // Compare and exchange failed.
385 // Restore lock-stack and handle the unlock in runtime.
386 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
387 addw(t2_top, t2_top, oopSize);
388 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
389 b(slow_path);
390 }
391
392
393 { // Handle inflated monitor.
394 bind(inflated_load_mark);
395 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
396 #ifdef ASSERT
397 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
398 stop("Fast Unlock not monitor");
399 #endif
400
401 bind(inflated);
402
403 #ifdef ASSERT
404 Label check_done;
405 subw(t2_top, t2_top, oopSize);
406 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
407 br(Assembler::LT, check_done);
408 ldr(t3_t, Address(rthread, t2_top));
409 cmp(obj, t3_t);
410 br(Assembler::NE, inflated);
411 stop("Fast Unlock lock on stack");
412 bind(check_done);
413 #endif
414
415 const Register t1_monitor = t1;
416
417 if (!UseObjectMonitorTable) {
418 assert(t1_monitor == t1_mark, "should be the same here");
419
420 // Untag the monitor.
421 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
422 } else {
423 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
424 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
425 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
426 br(Assembler::LO, slow_path);
427 }
428
429 const Register t2_recursions = t2;
430 Label not_recursive;
431
432 // Check if recursive.
433 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
434 cbz(t2_recursions, not_recursive);
435
436 // Recursive unlock.
437 sub(t2_recursions, t2_recursions, 1u);
438 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
439 // Set flag == EQ
440 cmp(t2_recursions, t2_recursions);
441 b(unlocked);
442
443 bind(not_recursive);
444
445 const Register t2_owner_addr = t2;
446
447 // Compute owner address.
448 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
449
450 // Set owner to null.
451 // Release to satisfy the JMM
452 stlr(zr, t2_owner_addr);
453 // We need a full fence after clearing owner to avoid stranding.
454 // StoreLoad achieves this.
455 membar(StoreLoad);
456
457 // Check if the entry_list is empty.
458 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
459 cmp(rscratch1, zr);
460 br(Assembler::EQ, unlocked); // If so we are done.
461
462 // Check if there is a successor.
463 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
464 cmp(rscratch1, zr);
465 br(Assembler::NE, unlocked); // If so we are done.
466
467 // Save the monitor pointer in the current thread, so we can try to
468 // reacquire the lock in SharedRuntime::monitor_exit_helper().
469 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
470
471 cmp(zr, rthread); // Set Flag to NE => slow path
472 b(slow_path);
473 }
474
475 bind(unlocked);
476 cmp(zr, zr); // Set Flags to EQ => fast path
477
478 #ifdef ASSERT
479 // Check that unlocked label is reached with Flags == EQ.
480 Label flag_correct;
481 br(Assembler::EQ, flag_correct);
482 stop("Fast Unlock Flag != EQ");
483 #endif
484
485 bind(slow_path);
486 #ifdef ASSERT
487 // Check that slow_path label is reached with Flags == NE.
488 br(Assembler::NE, flag_correct);
489 stop("Fast Unlock Flag != NE");
490 bind(flag_correct);
491 #endif
492 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
493 }
494
// Search for str1 in str2 and return index or -1
//
// Finds the first occurrence of the pattern str1 (cnt1 characters) in the
// source str2 (cnt2 characters) and sets 'result' to its character index, or
// -1 if not found.
//
//   icnt1 - pattern length when it is a compile-time constant (the code below
//           specializes icnt1 == 1, 2, 3, 4), or -1 when only known at runtime
//   ae    - StrIntrinsicNode encoding pair of (pattern, source): LL/UU/UL/LU
//
// Strategy (for icnt1 == -1): patterns of length < 8 use the inline linear
// scan; patterns of length 8..255 with a long enough source use the
// Boyer-Moore-Horspool code below; otherwise a linear-scan stub is called.
//
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore alogorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes and algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    // The bad-character table bc[] lives on the stack: ASIZE bytes,
    // initialized to cnt1 via the replicated v0 stores below.
    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1)); // bc[c] = m - i - 1
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8); // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE); // pop the bad-character table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE); // pop the bad-character table
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        // Replicate the pattern character into every lane of ch1, then use a
        // SWAR zero-lane test (eor/sub/orr/bics) on 8 bytes at a time.
        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}
932
933 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
934 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
935
// Finds the first occurrence of the UTF-16 character 'ch' in the UTF-16
// string at str1 (cnt1 characters). Sets 'result' to the character index of
// the match, or -1 if not found.
//
// Strings of >= 4 characters are scanned 4 characters (8 bytes) at a time
// using a SWAR zero-lane test; shorter strings use a per-character loop.
// Clobbers: rscratch1, rscratch2, rflags, tmp1-tmp3; str1, cnt1 and ch are
// destroyed.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four 16-bit lanes of the register.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // eor makes matching lanes zero; sub/orr/bics is the SWAR test for a
    // zero 16-bit lane.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Re-run the last (possibly overlapping) 8-byte window.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Locate the first zero lane: byte offset = clz(rev(mask)) / 8.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
998
999 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1000 Register ch, Register result,
1001 FloatRegister ztmp1,
1002 FloatRegister ztmp2,
1003 PRegister tmp_pg,
1004 PRegister tmp_pdn, bool isL)
1005 {
1006 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1007 assert(tmp_pg->is_governing(),
1008 "this register has to be a governing predicate register");
1009
1010 Label LOOP, MATCH, DONE, NOMATCH;
1011 Register vec_len = rscratch1;
1012 Register idx = rscratch2;
1013
1014 SIMD_RegVariant T = (isL == true) ? B : H;
1015
1016 cbz(cnt1, NOMATCH);
1017
1018 // Assign the particular char throughout the vector.
1019 sve_dup(ztmp2, T, ch);
1020 if (isL) {
1021 sve_cntb(vec_len);
1022 } else {
1023 sve_cnth(vec_len);
1024 }
1025 mov(idx, 0);
1026
1027 // Generate a predicate to control the reading of input string.
1028 sve_whilelt(tmp_pg, T, idx, cnt1);
1029
1030 BIND(LOOP);
1031 // Read a vector of 8- or 16-bit data depending on the string type. Note
1032 // that inactive elements indicated by the predicate register won't cause
1033 // a data read from memory to the destination vector.
1034 if (isL) {
1035 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1036 } else {
1037 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1038 }
1039 add(idx, idx, vec_len);
1040
1041 // Perform the comparison. An element of the destination predicate is set
1042 // to active if the particular char is matched.
1043 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1044
1045 // Branch if the particular char is found.
1046 br(NE, MATCH);
1047
1048 sve_whilelt(tmp_pg, T, idx, cnt1);
1049
1050 // Loop back if the particular char not found.
1051 br(MI, LOOP);
1052
1053 BIND(NOMATCH);
1054 mov(result, -1);
1055 b(DONE);
1056
1057 BIND(MATCH);
1058 // Undo the index increment.
1059 sub(idx, idx, vec_len);
1060
1061 // Crop the vector to find its location.
1062 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1063 add(result, idx, -1);
1064 sve_incp(result, T, tmp_pdn);
1065 BIND(DONE);
1066 }
1067
// Latin1 variant of string_indexof_char: find the first occurrence of the
// byte `ch` in `str1` and leave its index in `result`, or -1 if it does not
// occur within the first `cnt1` bytes. Counts >= 8 run 8 bytes at a time
// using the SWAR "has-zero-byte" trick; shorter inputs use a byte loop.
// Clobbers: rscratch1, rscratch2, rflags; str1, cnt1 and ch are destroyed.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;    // becomes a negative byte offset from the end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate the byte into all eight bytes of ch.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 just before the last 8 bytes; scan with a negative offset
  // counting up towards zero.
  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // SWAR: after the eor, a matching byte is zero; (x - 0x01...) with the
    // bits of (x | 0x7f...) cleared flags exactly the zero bytes.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Re-check the final (possibly overlapping) 8-byte block once.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Byte offset of the lowest flagged byte: clz(rev(mask)) / 8.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 8 bytes: scalar loop with the same negative-offset scheme.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Index = bytes preceding the scanned block + offset within it.
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1131
// Compare strings.
// `ae` encodes the argument encodings (StrIntrinsicNode::LL/UU/LU/UL, where
// L = Latin1 and U = UTF-16). Leaves a negative/zero/positive value in
// `result` per the usual compareTo contract. Inputs past `stub_threshold`
// characters are delegated to the pre-generated compare_long_string_* stubs.
// Clobbers: rscratch1, rscratch2, rflags; the str/cnt registers are destroyed.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Number of characters that fit in one 8-byte word.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical references: the comparison reduces to the length
      // difference already stored in `result`.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at the tail and scan with a negative offset.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Load 4 Latin1 chars from str1 and inflate them to UTF-16 by
      // interleaving with a zeroed register (zip1), so both sides compare
      // as 8-byte words.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror image of the LU case: str2 is the Latin1 side to inflate.
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    // rev+clz yields the bit offset of the lowest differing byte; round it
    // down to a character boundary, shift both words so the differing char
    // is at bit 0, then zero-extend each char and subtract.
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Software-pipelined scalar loop: the characters for iteration i+1 are
  // loaded while iteration i's characters are compared. str1 chars alternate
  // between tmp1/tmp2, str2 chars between cnt1/rscratch1.
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1367
1368 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1369 FloatRegister src2, Condition cond, bool isQ) {
1370 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1371 FloatRegister zn = src1, zm = src2;
1372 bool needs_negation = false;
1373 switch (cond) {
1374 case LT: cond = GT; zn = src2; zm = src1; break;
1375 case LE: cond = GE; zn = src2; zm = src1; break;
1376 case LO: cond = HI; zn = src2; zm = src1; break;
1377 case LS: cond = HS; zn = src2; zm = src1; break;
1378 case NE: cond = EQ; needs_negation = true; break;
1379 default:
1380 break;
1381 }
1382
1383 if (is_floating_point_type(bt)) {
1384 fcm(cond, dst, size, zn, zm);
1385 } else {
1386 cm(cond, dst, size, zn, zm);
1387 }
1388
1389 if (needs_negation) {
1390 notr(dst, isQ ? T16B : T8B, dst);
1391 }
1392 }
1393
1394 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1395 Condition cond, bool isQ) {
1396 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1397 if (bt == T_FLOAT || bt == T_DOUBLE) {
1398 if (cond == Assembler::NE) {
1399 fcm(Assembler::EQ, dst, size, src);
1400 notr(dst, isQ ? T16B : T8B, dst);
1401 } else {
1402 fcm(cond, dst, size, src);
1403 }
1404 } else {
1405 if (cond == Assembler::NE) {
1406 cm(Assembler::EQ, dst, size, src);
1407 notr(dst, isQ ? T16B : T8B, dst);
1408 } else {
1409 cm(cond, dst, size, src);
1410 }
1411 }
1412 }
1413
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  // Each step ORs the register with a right-shifted copy of itself, folding
  // the flag bits of neighbouring byte groups together; after three steps the
  // eight flag bits sit in the low byte, which the final mask isolates.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1424
// Pack the value of each mask element in "src" into a long value in "dst", at most
// the first 64 lane elements. The input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
// one bit in "dst".
//
// Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
// Expected: dst = 0x658D
//
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
                                         FloatRegister vtmp, int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(src, vtmp);
  assert(UseSVE > 0, "must be");

  // Compress the lowest 8 bytes.
  fmovd(dst, src);
  bytemask_compress(dst);
  if (lane_cnt <= 8) return;

  // Repeat on higher bytes and join the results.
  // Compress 8 bytes in each iteration.
  for (int idx = 1; idx < (lane_cnt / 8); idx++) {
    // Extract the idx-th 64-bit chunk of the mask into rscratch1, compress
    // its 8 lane flags into 8 bits, and merge them at bit position idx*8.
    sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
    bytemask_compress(rscratch1);
    orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
  }
}
1454
// The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
// instruction which requires the FEAT_BITPERM feature.
// Clobbers: vtmp1, vtmp2 (no general-purpose scratch registers).
void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(src, vtmp1, vtmp2);
  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");

  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress each significant bit of the byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the biggest lane size (T = D) then
  // concatenate the results.

  // The second source input of BEXT, initialized with 0x01 in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BEXT vtmp1.D, src.D, vtmp2.D
  // src   = 0x0001010000010001 | 0x0100000001010001
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  // ---------------------------------------
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  sve_bext(vtmp1, D, src, vtmp2);

  // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
  // result to dst.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  // dst   = 0x658D
  if (lane_cnt <= 8) {
    // No need to concatenate.
    umov(dst, vtmp1, B, 0);
  } else if (lane_cnt <= 16) {
    // Move the second chunk's byte next to the first, then read a halfword.
    ins(vtmp1, B, vtmp1, 1, 8);
    umov(dst, vtmp1, H, 0);
  } else {
    // As the lane count is 64 at most, the final expected value must be in
    // the lowest 64 bits after narrowing vtmp1 from D to B.
    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
    umov(dst, vtmp1, D, 0);
  }
}
1498
// Unpack the mask, a long value in "src", into a vector register of boolean
// represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
// most 64 lanes.
//
// Below example gives the expected dst vector register, with a valid src(0x658D)
// on a 128-bit vector size machine.
// dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
                                           FloatRegister vtmp, int lane_cnt) {
  assert_different_registers(dst, vtmp);
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");

  // Example: src = 0x658D, lane_cnt = 16
  // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01

  // Put long value from general purpose register into the first lane of vector.
  // vtmp = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp, B, 0);
  mov(vtmp, D, 0, src);

  // Transform the value in the first lane which is mask in bit now to the mask in
  // byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp, B, vtmp, 8, 1);
  } else {
    sve_vector_extend(vtmp, D, vtmp, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(dst, B, 1);

  // BDEP dst.D, vtmp.D, dst.D
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  // dst  = 0x0101010101010101 | 0x0101010101010101
  // ---------------------------------------
  // dst  = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(dst, D, vtmp, dst);
}
1545
1546 // Clobbers: rflags
1547 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1548 FloatRegister zn, FloatRegister zm, Condition cond) {
1549 assert(pg->is_governing(), "This register has to be a governing predicate register");
1550 FloatRegister z1 = zn, z2 = zm;
1551 switch (cond) {
1552 case LE: z1 = zm; z2 = zn; cond = GE; break;
1553 case LT: z1 = zm; z2 = zn; cond = GT; break;
1554 case LO: z1 = zm; z2 = zn; cond = HI; break;
1555 case LS: z1 = zm; z2 = zn; cond = HS; break;
1556 default:
1557 break;
1558 }
1559
1560 SIMD_RegVariant size = elemType_to_regVariant(bt);
1561 if (is_floating_point_type(bt)) {
1562 sve_fcm(cond, pd, size, pg, z1, z2);
1563 } else {
1564 assert(is_integral_type(bt), "unsupported element type");
1565 sve_cmp(cond, pd, size, pg, z1, z2);
1566 }
1567 }
1568
// Get index of the last mask lane that is set.
// Clobbers: rscratch1.
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the mask so the last set lane becomes the first, find that first
  // set lane, then flip the index back: result = (lanes - 1) - position.
  sve_rev(ptmp, size, src);
  // brkb clears ptmp from the first active lane onwards, so cntp counts the
  // lanes preceding it, i.e. the position of the first set lane.
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1578
// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
  // Each _xshll is a widening shift by 0 that doubles the element size;
  // `is_unsigned` selects zero- vs sign-extension. Widenings across two size
  // steps chain two shifts.
  if (src_bt == T_BYTE) {
    // 4B to 4S/4I, 8B to 8S
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    if (dst_bt == T_INT) {
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 2S to 2I/2L, 4S to 4I
    assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
    if (dst_bt == T_LONG) {
      _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
    }
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}
1607
// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
  // Each xtn halves the element size by keeping the low half of every lane;
  // narrowings across two size steps chain a second xtn.
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 2I to 2S, 4I to 4B/4S
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2S/2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T2S, src, T2D);
    if (dst_bt == T_SHORT) {
      xtn(dst, T4H, dst, T4S);
    }
  } else {
    ShouldNotReachHere();
  }
}
1637
// Extend an SVE integer vector from src_size lanes to dst_size lanes.
// Each _sve_xunpk widens the low-half lanes by one size step (B->H->S->D);
// `is_unsigned` selects zero- vs sign-extension. Larger jumps chain steps.
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          bool is_unsigned) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

  if (src_size == B) {
    switch (dst_size) {
    case H:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      break;
    case S:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      break;
    case D:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
    } else { // D
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
    }
  } else if (src_size == S) {
    _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
  }
}
1671
// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  // tmp is zeroed so that when uzp1 concatenates the even-numbered halves of
  // (src, tmp), the upper part of dst is filled with zeros.
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}
1711
1712 // Extend src predicate to dst predicate with the same lane count but larger
1713 // element size, e.g. 64Byte -> 512Long
1714 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1715 uint dst_element_length_in_bytes,
1716 uint src_element_length_in_bytes) {
1717 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1718 sve_punpklo(dst, src);
1719 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1720 sve_punpklo(dst, src);
1721 sve_punpklo(dst, dst);
1722 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1723 sve_punpklo(dst, src);
1724 sve_punpklo(dst, dst);
1725 sve_punpklo(dst, dst);
1726 } else {
1727 assert(false, "unsupported");
1728 ShouldNotReachHere();
1729 }
1730 }
1731
// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
// Clobbers: ptmp.
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  sve_pfalse(ptmp);
  // Each uzp1 step halves the element size; 2x/4x/8x narrowings need 1/2/3 steps.
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}
1761
// Vector reduction add for integral type with ASIMD instructions.
// dst = isrc + sum(vsrc lanes), with the lane sum sign-extended as needed.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      // Sum all lanes into lane 0, then add the scalar input with the
      // appropriate sign extension of the lane sum.
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      // The 64-bit (2S) case uses a pairwise add instead of addv.
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      // Only two lanes: a single pairwise add produces the sum.
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}
1800
// Vector reduction multiply for integral type with ASIMD instructions.
// dst = isrc * product(vsrc lanes), truncated to the element width at each
// step for sub-int types. Lanes are combined by repeatedly multiplying the
// upper half of the vector into the lower half.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1], re-sign-extending after each
      // multiply to keep byte-truncation semantics.
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      // Same halving strategy as T_BYTE, on halfword lanes.
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      // Halve once for 128-bit input, then fold the last two lanes into dst.
      if (isQ) {
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      // Only two lanes: multiply them into the scalar input directly.
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1881
1882 // Vector reduction multiply for floating-point type with ASIMD instructions.
1883 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1884 FloatRegister fsrc, FloatRegister vsrc,
1885 unsigned vector_length_in_bytes,
1886 FloatRegister vtmp) {
1887 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1888 bool isQ = vector_length_in_bytes == 16;
1889
1890 BLOCK_COMMENT("neon_reduce_mul_fp {");
1891 switch(bt) {
1892 case T_FLOAT:
1893 fmuls(dst, fsrc, vsrc);
1894 ins(vtmp, S, vsrc, 0, 1);
1895 fmuls(dst, dst, vtmp);
1896 if (isQ) {
1897 ins(vtmp, S, vsrc, 0, 2);
1898 fmuls(dst, dst, vtmp);
1899 ins(vtmp, S, vsrc, 0, 3);
1900 fmuls(dst, dst, vtmp);
1901 }
1902 break;
1903 case T_DOUBLE:
1904 assert(isQ, "unsupported");
1905 fmuld(dst, fsrc, vsrc);
1906 ins(vtmp, D, vsrc, 0, 1);
1907 fmuld(dst, dst, vtmp);
1908 break;
1909 default:
1910 assert(false, "unsupported");
1911 ShouldNotReachHere();
1912 }
1913 BLOCK_COMMENT("} neon_reduce_mul_fp");
1914 }
1915
1916 // Helper to select logical instruction
1917 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1918 Register Rn, Register Rm,
1919 enum shift_kind kind, unsigned shift) {
1920 switch(opc) {
1921 case Op_AndReductionV:
1922 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1923 break;
1924 case Op_OrReductionV:
1925 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1926 break;
1927 case Op_XorReductionV:
1928 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1929 break;
1930 default:
1931 assert(false, "unsupported");
1932 ShouldNotReachHere();
1933 }
1934 }
1935
// Vector reduction logical operations And, Or, Xor
// dst = isrc op vsrc[0] op ... op vsrc[lanes - 1], sign-extended for
// sub-int element types.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  // Move the two vector halves into general registers and combine them,
  // then keep folding the upper bits onto the lower bits with shifted
  // logical ops until one element's width remains.
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      // Fold in the scalar input and sign-extend the byte result.
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      // Fold in the scalar input and sign-extend the short result.
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      // Two 64-bit lanes were already combined above; only the scalar
      // input remains to be folded in.
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}
1985
1986 // Helper function to decode min/max reduction operation properties
1987 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
1988 bool* is_unsigned,
1989 Condition* cond) {
1990 switch(opc) {
1991 case Op_MinReductionV:
1992 *is_min = true; *is_unsigned = false; *cond = LT; break;
1993 case Op_MaxReductionV:
1994 *is_min = false; *is_unsigned = false; *cond = GT; break;
1995 case Op_UMinReductionV:
1996 *is_min = true; *is_unsigned = true; *cond = LO; break;
1997 case Op_UMaxReductionV:
1998 *is_min = false; *is_unsigned = true; *cond = HI; break;
1999 default:
2000 ShouldNotReachHere();
2001 }
2002 }
2003
// Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
// dst receives the min/max of the scalar input "isrc" and every lane of "vsrc".
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
         opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min;
  bool is_unsigned;
  Condition cond;
  decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    // T_LONG: fold the two 64-bit lanes and the scalar input with
    // scalar compare-and-select.
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, cond);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, cond);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      // For T2S (2x32-bit elements), use pairwise instructions because
      // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
      neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
    } else {
      // For other sizes, use reduction to scalar instructions.
      neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
    }
    // Move the vector-side result to dst: sub-int signed values need smov's
    // sign extension, everything else takes umov.
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else if (is_unsigned) {
      umov(dst, vtmp, elemType_to_regVariant(bt), 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    // Finally fold in the scalar input.
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, cond);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
2053
// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
// dst = src1 op src2[0] op ... op src2[lanes - 1] for the active lanes in pg.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        // Sign-extend the element-wide sum while adding the scalar input.
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      // Sub-int elements use smov so the value is sign-extended before
      // being combined with the scalar input.
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV:
    case Op_MinReductionV:
    case Op_UMaxReductionV:
    case Op_UMinReductionV: {
      bool is_min;
      bool is_unsigned;
      Condition cond;
      decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
      sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
      // Move result from vector to general register
      if (is_unsigned || bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      // Fold in the scalar input with a compare-and-select.
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, cond);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, cond);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // The logical reductions combined a sign-extended element with the
  // full-width scalar input, so narrow the result back with sign extension
  // for sub-int element types.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
2164
2165 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2166 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2167 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2168 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2169 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2170 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2171
2172 // Set all elements to false if the input "lane_cnt" is zero.
2173 if (lane_cnt == 0) {
2174 sve_pfalse(dst);
2175 return;
2176 }
2177
2178 SIMD_RegVariant size = elemType_to_regVariant(bt);
2179 assert(size != Q, "invalid size");
2180
2181 // Set all true if "lane_cnt" equals to the max lane count.
2182 if (lane_cnt == max_vector_length) {
2183 sve_ptrue(dst, size, /* ALL */ 0b11111);
2184 return;
2185 }
2186
2187 // Fixed numbers for "ptrue".
2188 switch(lane_cnt) {
2189 case 1: /* VL1 */
2190 case 2: /* VL2 */
2191 case 3: /* VL3 */
2192 case 4: /* VL4 */
2193 case 5: /* VL5 */
2194 case 6: /* VL6 */
2195 case 7: /* VL7 */
2196 case 8: /* VL8 */
2197 sve_ptrue(dst, size, lane_cnt);
2198 return;
2199 case 16:
2200 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2201 return;
2202 case 32:
2203 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2204 return;
2205 case 64:
2206 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2207 return;
2208 case 128:
2209 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2210 return;
2211 case 256:
2212 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2213 return;
2214 default:
2215 break;
2216 }
2217
2218 // Special patterns for "ptrue".
2219 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2220 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2221 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2222 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2223 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2224 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2225 } else {
2226 // Encode to "whileltw" for the remaining cases.
2227 mov(rscratch1, lane_cnt);
2228 sve_whileltw(dst, size, zr, rscratch1);
2229 }
2230 }
2231
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Each half of src is widened to 32-bit lanes so that "sve_compact" (which is
// only available for word/doubleword element sizes) can do the packing; the
// packed halves are then narrowed back to 16-bit lanes and spliced together.
// Clobbers: rscratch1
// Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // high <-- low
  // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst = 00 00 00 hh ee dd bb aa

  // Extend lowest half to type INT.
  // dst = 00dd 00cc 00bb 00aa
  sve_uunpklo(dst, S, src);
  // pgtmp = 0001 0000 0001 0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst = 0000 00dd 00bb 00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 0001 0000 0000 0001
  sve_punpkhi(pgtmp, mask);
  // vtmp = 00hh 00gg 00ff 00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp = 0000 0000 00hh 00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // Build a predicate covering the elements kept from the low half, which
  // tells splice where the high half's elements are inserted.
  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:  dst  = 00 00 00 00 00 dd bb aa
  // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  // dst = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2290
// Pack active byte elements of src, under the control of mask, into the
// lowest-numbered elements of dst; remaining elements are zero-filled.
// Each half of src is widened to SHORT and compressed via sve_compress_short,
// then the two packed halves are narrowed back and spliced together.
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 00 01 00 00 00 01 00 01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst = 00 00 00 00 00 0g 0c 0a
  // The widened data spans twice the bytes but never more than one full
  // vector, so clamp the length handed down to sve_compress_short.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp = 00 01 00 00 00 00 00 01
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 00 00 00 00 00 00 0p 0i
  // The early return above guarantees extended_size > MaxVectorSize here, so
  // the difference is the widened length remaining for the high half.
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:  dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2347
2348 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2349 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2350 SIMD_Arrangement size = isQ ? T16B : T8B;
2351 if (bt == T_BYTE) {
2352 rbit(dst, size, src);
2353 } else {
2354 neon_reverse_bytes(dst, src, bt, isQ);
2355 rbit(dst, size, dst);
2356 }
2357 }
2358
2359 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2360 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2361 SIMD_Arrangement size = isQ ? T16B : T8B;
2362 switch (bt) {
2363 case T_BYTE:
2364 if (dst != src) {
2365 orr(dst, size, src, src);
2366 }
2367 break;
2368 case T_SHORT:
2369 rev16(dst, size, src);
2370 break;
2371 case T_INT:
2372 rev32(dst, size, src);
2373 break;
2374 case T_LONG:
2375 rev64(dst, size, src);
2376 break;
2377 default:
2378 assert(false, "unsupported");
2379 ShouldNotReachHere();
2380 }
2381 }
2382
// VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
// But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
// For VectorRearrange long/double, we compare the shuffle input with iota indices,
// and use bsl to implement the operation.
void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
                                           FloatRegister shuffle, FloatRegister tmp,
                                           BasicType bt, bool isQ) {
  assert_different_registers(dst, src, shuffle, tmp);
  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Here is an example that rearranges a NEON vector with 4 ints:
  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
  //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
  //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
  //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
  //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
  //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
  //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
  //   4. Use Vm as index register, and use V1 as table register.
  //      Then get V2 as the result by tbl NEON instructions.
  switch (bt) {
    case T_SHORT:
      // 2-byte lanes: lane index * 2 gives the base byte of each group and
      // {0x00, 0x01} are the byte offsets within a lane.
      mov(tmp, size1, 0x02);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x0100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_INT:
    case T_FLOAT:
      // 4-byte lanes, exactly as in the worked example above.
      mov(tmp, size1, 0x04);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x03020100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_LONG:
    case T_DOUBLE:
      // Load the iota indices for Long type. The indices are ordered by
      // type B/S/I/L/F/D, and the offset between two types is 16; Hence
      // the offset for L is 48.
      lea(rscratch1,
          ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
      ldrq(tmp, rscratch1);
      // Check whether the input "shuffle" is the same with iota indices.
      // Return "src" if true, otherwise swap the two elements of "src".
      // cm(EQ) yields an all-ones/all-zeros mask per lane; bsl then picks
      // src where the shuffle matched and the ext-swapped halves otherwise.
      cm(EQ, dst, size2, shuffle, tmp);
      ext(tmp, size1, src, src, 8);
      bsl(dst, size1, src, tmp);
      break;
    default:
      assert(false, "unsupported element type");
      ShouldNotReachHere();
  }
}
2440
2441 // Extract a scalar element from an sve vector at position 'idx'.
2442 // The input elements in src are expected to be of integral type.
2443 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2444 int idx, FloatRegister vtmp) {
2445 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2446 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2447 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2448 if (bt == T_INT || bt == T_LONG) {
2449 umov(dst, src, size, idx);
2450 } else {
2451 smov(dst, src, size, idx);
2452 }
2453 } else {
2454 sve_orr(vtmp, src, src);
2455 sve_ext(vtmp, vtmp, idx << size);
2456 if (bt == T_INT || bt == T_LONG) {
2457 umov(dst, vtmp, size, 0);
2458 } else {
2459 smov(dst, vtmp, size, 0);
2460 }
2461 }
2462 }
2463
// java.lang.Math::round intrinsics

// Vector rounding of float/double lanes. Two candidate results are computed
// per lane: tmp1 = floor(src + 0.5) (ties to even) and dst = fcvtas(src)
// (ties away); a per-lane mask built from the raw bits of -src selects
// between them.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      // 0x1.0p23f == 2^23, the smallest float magnitude at which every
      // representable value is already an integer.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      // 2^52 plays the same role for doubles.
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Unsigned integer compare of the bit patterns of -src against 2^23/2^52.
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // bif keeps dst (the fcvtas result) where tmp3 is all-ones and inserts
  // tmp1 (the floor result) where tmp3 is zero.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2498
// SVE variant of the vector Math.round intrinsic: start from frinta (ties
// away), then re-round the lanes flagged by the bit-pattern compare with
// floor(src + 0.5), and finally convert to integers.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // 2^23: the smallest float magnitude at which every value is integral.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      // 2^52 plays the same role for doubles.
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // pgtmp = lanes where the bits of 2^23/2^52 are unsigned-higher-or-same
  // than the bits of -src; only those lanes need the floor fix-up.
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // After an SVE compare, EQ (Z set) means no active lane matched,
  // so the fix-up can be skipped entirely.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  // Convert the rounded floating-point lanes to integers.
  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2536
// Vector signum: dst[i] = +/-1.0 (carrying src's sign) for nonzero numeric
// lanes, and src[i] itself for +/-0.0 and NaN lanes.
// "zero" and "one" are expected to hold 0.0 and +1.0 per lane respectively.
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  // facgt: |src| > 0.0 yields an all-ones lane; false for +/-0.0 and NaN.
  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  // bsl with mask 0x7FF..F merges all bits of "one" except the sign bit,
  // which is taken from src: +/-1.0 with src's sign. Lanes with a zero mask
  // pass src through unchanged.
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
2546
// SVE variant of vector signum: dst[i] = +/-1.0 (carrying src's sign) for
// nonzero numeric lanes, and src[i] itself for +/-0.0 and NaN lanes.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      // Same scheme as the float path: min_jlong is the sign-bit mask,
      // then OR in the bit pattern of +1.0.
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}
2571
2572 bool C2_MacroAssembler::in_scratch_emit_size() {
2573 if (ciEnv::current()->task() != nullptr) {
2574 PhaseOutput* phase_output = Compile::current()->output();
2575 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2576 return true;
2577 }
2578 }
2579 return MacroAssembler::in_scratch_emit_size();
2580 }
2581
// Out-of-line failure path for verify_int_in_range(); reports the offending
// value and range, then terminates the VM (fatal does not return).
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2585
// Emit a runtime check that the int value in "rval" lies within the range
// [t->_lo, t->_hi] of a CastII node's type. On failure control reaches
// abort_verify_int_in_range(), which terminates the VM. Emits nothing for
// the unconstrained TypeInt::INT.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  // Only materialize comparisons for bounds that actually constrain.
  if (lo != min_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jint) {
    subsw(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, val, lo, hi) into the argument registers and call the
  // reporting routine, which does not return.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  hlt(0); // Not reached.

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}
2620
// Out-of-line failure path for verify_long_in_range(); reports the offending
// value and range, then terminates the VM (fatal does not return).
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2624
// Emit a runtime check that the long value in "rval" lies within the range
// [t->_lo, t->_hi] of a CastLL node's type. On failure control reaches
// abort_verify_long_in_range(), which terminates the VM. Emits nothing for
// the unconstrained TypeLong::LONG.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  // Only materialize comparisons for bounds that actually constrain.
  if (lo != min_jlong) {
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jlong) {
    subs(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, val, lo, hi) into the argument registers and call the
  // reporting routine, which does not return.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  mov(c_rarg2, lo);
  mov(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
  hlt(0); // Not reached.

  bind(L_success);
  BLOCK_COMMENT("} verify_long_in_range");
}
2659
// Ensure rfp holds a valid frame pointer for the current compiled frame.
// With PreserveFramePointer the register is already maintained (only
// verified here in debug builds); otherwise it is recomputed from sp and
// the frame size.
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rfp.
    add(rtmp, sp, framesize - 2 * wordSize);
    Label L_success;
    cmp(rfp, rtmp);
    br(Assembler::EQ, L_success);
    stop("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    // rfp = sp + framesize - 2 words, i.e. the slot pair at the top of the
    // frame (presumably the saved rfp/lr pair, matching the check above).
    add(rfp, sp, framesize - 2 * wordSize);
  }
}
2677
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using Neon instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
                                                     FloatRegister src2, FloatRegister index,
                                                     FloatRegister tmp, unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, tmp);
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;

  if (vector_length_in_bytes == 16) {
    assert(UseSVE <= 1, "sve must be <= 1");
    // The two-register table form requires src1, src2 to be consecutive.
    assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
    tbl(dst, size, src1, 2, index);
  } else { // vector length == 8
    assert(UseSVE == 0, "must be Neon only");
    // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
    // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
    // instruction with one vector lookup
    ins(tmp, D, src1, 0, 0);
    ins(tmp, D, src2, 1, 0);
    tbl(dst, size, tmp, 1, index);
  }
}
2705
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
                                                    FloatRegister src2, FloatRegister index,
                                                    FloatRegister tmp, SIMD_RegVariant T,
                                                    unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, index, tmp);

  if (vector_length_in_bytes == 8) {
    // We need to fit both the source vectors (src1, src2) in a single vector register because the
    // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
    // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
    // instruction with one vector lookup
    assert(UseSVE >= 1, "sve must be >= 1");
    ins(tmp, D, src1, 0, 0);
    ins(tmp, D, src2, 1, 0);
    sve_tbl(dst, T, tmp, index);
  } else { // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
    // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
    // is not executed on machines where vector_length_in_bytes < MaxVectorSize
    // with the only exception of 8B vector length.
    assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
    // The two-vector table form requires src1, src2 to be consecutive.
    assert(src1->successor() == src2, "Source registers must be ordered");
    sve_tbl(dst, T, src1, src2, index);
  }
}
2737
2738 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2739 FloatRegister src2, FloatRegister index,
2740 FloatRegister tmp, BasicType bt,
2741 unsigned vector_length_in_bytes) {
2742
2743 assert_different_registers(dst, src1, src2, index, tmp);
2744
2745 // The cases that can reach this method are -
2746 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2747 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2748 //
2749 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2750 // and UseSVE = 2 with vector_length_in_bytes >= 8
2751 //
2752 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2753 // UseSVE = 1 with vector_length_in_bytes = 16
2754
2755 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2756 SIMD_RegVariant T = elemType_to_regVariant(bt);
2757 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2758 return;
2759 }
2760
2761 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2762 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2763 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2764
2765 bool isQ = vector_length_in_bytes == 16;
2766
2767 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2768 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2769
2770 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2771 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2772 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2773 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2774 // the indices can range from [0, 8).
2775 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2776 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2777 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2778 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2779 // Add the multiplied result to the vector in tmp to obtain the byte level
2780 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2781 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2782
2783 if (bt == T_BYTE) {
2784 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2785 } else {
2786 int elem_size = (bt == T_SHORT) ? 2 : 4;
2787 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2788
2789 mov(tmp, size1, elem_size);
2790 mulv(dst, size2, index, tmp);
2791 mov(tmp, size2, tbl_offset);
2792 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2793 // to select a set of 2B/4B
2794 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2795 }
2796 }
2797
// Vector expand implementation. Elements from the src vector are expanded into
// the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
// (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
// for NEON and SVE, but with different instructions where appropriate.

// Vector expand implementation for NEON.
//
// An example of 128-bit Byte vector:
// Data direction: high <== low
// Input:
//   src  = g f e  d  c b a  9  8 7 6  5  4 3 2  1
//   mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
// Expected result:
//   dst  = 0 0 8  7  0 0 6  5  0 0 4  3  0 0 2  1
//
// dst  - destination vector (must differ from all other registers)
// src  - source vector whose active elements are scattered to masked lanes
// mask - lane mask; all-ones lanes receive the next src element, zero lanes get 0
// tmp1, tmp2 - clobbered scratch vectors
// bt   - element BasicType (indices are computed at byte granularity regardless)
// vector_length_in_bytes - 8 or 16
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte table, we need to
  // compute indices in byte type for all types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  dup(tmp1, size, zr);
  // Negate the mask (-1 -> 1) so each active lane contributes 1 to the sum below.
  // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm: each doubling step
  // shifts the partial sums left by i bytes (via EXT with the zero vector) and
  // accumulates, yielding an inclusive prefix sum after log2(len) steps.
  // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
  // Copy the mask into tmp2 (ORR with itself is a register move).
  // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
  orr(tmp2, size, mask, mask);
  // Select prefix sums in active lanes, zeros elsewhere.
  // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  movi(tmp1, size, 1);
  // Convert 1-based counts to 0-based TBL indices; inactive lanes underflow to -1,
  // which TBL maps to 0 (out-of-range indices yield zero).
  // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
  subv(dst, size, tmp2, tmp1);
  // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
  tbl(dst, size, src, 1, dst);
}
2844
// Vector expand implementation for SVE.
//
// An example of 128-bit Short vector:
// Data direction: high <== low
// Input:
//   src = gf ed cb a9 87 65 43 21
//   pg  = 00 01 00 01 00 01 00 01
// Expected result:
//   dst = 00 87 00 65 00 43 00 21
//
// dst - destination vector
// src - source vector whose leading elements are scattered to active lanes
// pg  - governing predicate selecting the destination lanes
// tmp1, tmp2 - clobbered scratch vectors
// bt  - element BasicType; determines the element size used for the prefix sum
// vector_length_in_bytes - vector length in bytes
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
  sve_movprfx(tmp2, tmp1);
  // Put 1 in each active lane (predicated CPY, merging) so the sum counts them.
  // tmp2 = 00 01 00 01 00 01 00 01
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm: each doubling step
  // shifts the partial sums by i bytes (EXT against the zero vector) and adds.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width sve register. The correct
    // index calculation method is:
    // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
  // Keep prefix sums in active lanes, zeros in inactive lanes.
  // dst = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // Convert 1-based counts to 0-based TBL indices; inactive lanes underflow to -1,
  // which the TBL lookup maps to zero (out-of-range index).
  // dst = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
  // dst = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}
2884
2885 // Optimized SVE cpy (imm, zeroing) instruction.
2886 //
2887 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2888 // functionality, but test results show that `movi; cpy(imm, merging)` has
2889 // higher throughput on some microarchitectures. This would depend on
2890 // microarchitecture and so may vary between implementations.
2891 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2892 PRegister pg, int imm8, bool isMerge) {
2893 if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2894 // Generates a NEON instruction `movi V<dst>.2d, #0`.
2895 // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2896 // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2897 // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2898 // entire Z<dst> register. According to the Arm Software Optimization
2899 // Guide, `movi` is zero latency.
2900 movi(dst, T2D, 0);
2901 isMerge = true;
2902 }
2903 Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2904 }