1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "opto/c2_MacroAssembler.hpp"
28 #include "opto/compile.hpp"
29 #include "opto/intrinsicnode.hpp"
30 #include "opto/matcher.hpp"
31 #include "opto/output.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/objectMonitorTable.hpp"
34 #include "runtime/stubRoutines.hpp"
35 #include "runtime/synchronizer.hpp"
36 #include "utilities/globalDefinitions.hpp"
37 #include "utilities/powerOfTwo.hpp"
38
39 #ifdef PRODUCT
40 #define BLOCK_COMMENT(str) /* nothing */
41 #define STOP(error) stop(error)
42 #else
43 #define BLOCK_COMMENT(str) block_comment(str)
44 #define STOP(error) block_comment(error); stop(error)
45 #endif
46
47 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
48
49 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
50
51 // jdk.internal.util.ArraysSupport.vectorizedHashCode
// Emits the intrinsic for jdk.internal.util.ArraysSupport.vectorizedHashCode.
//
// Accumulates the polynomial hash "result = result * 31 + element" over the
// cnt elements of ary (the scalar loop below loads the multiplier 31 = 0x1f
// into tmp2). Arrays of at least large_threshold elements are handed to the
// large_arrays_hashcode stub; the remaining tail (or a small array) is handled
// by a scalar loop unrolled unroll_factor times, entered at a computed offset
// so that only cnt % unroll_factor load+madd pairs execute on the first pass.
//
// Returns pc() on success, or nullptr if the trampoline call to the stub could
// not be emitted (code cache full).
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT ? 4
                    : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  // Debug-build tag so the disassembly identifies the element type.
  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  movw(tmp2, 0x1f); // hash multiplier 31
  br(tmp1);

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1); // result = result * 31 + element
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  // Hand the bulk of the array to the per-type SIMD stub; it leaves any
  // remainder to be finished by the scalar TAIL loop above.
  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    // Code cache is full; bail out and let the caller retry after a flush.
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}
151
// Emits the C2 fast path for monitorenter (lightweight locking).
//
// Tries to lock 'obj' by pushing it onto the current thread's lock-stack
// (plain or recursive lightweight case), or — when the mark word says the
// monitor is inflated — by CAS-ing the ObjectMonitor's owner field. With
// UseObjectMonitorTable the monitor is found via the per-thread OMCache or,
// failing that, a hashed lookup in the global ObjectMonitorTable; 'box' then
// caches the found monitor for the matching fast_unlock.
//
// Contract with C2: on exit the condition flags encode the outcome —
// EQ => locked, NE => the caller must take the slow path.
// Clobbers t1, t2, t3 and rscratch2.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must not be synchronized on; divert to the runtime
    // so it can emit the diagnostic.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive: obj already on top of the lock-stack.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      // Without the table, the (tagged) mark word itself is the monitor pointer.
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      const Register t1_hash = t1;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mov(t3, t1_mark);

      // Look for the monitor in the om_cache.

      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
        ldr(t2, Address(rthread, cache_offset));
        cmp(obj, t2);
        br(Assembler::EQ, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);

      // Get the table and calculate the bucket's address
      lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
      ldr(t3, Address(t3));
      ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
      ands(t1_hash, t1_hash, t2); // bucket index = hash & capacity_mask
      ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
      br(Assembler::LO, slow_path);

      // Check if object matches (the bucket may hold a different object's monitor).
      ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, t3, t2, slow_path);
      cmp(t3, obj);
      br(Assembler::NE, slow_path);

      bind(monitor_found);
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // When the monitor came from the mark word it still carries the tag bits;
    // subtract them as part of the field offset instead of untagging first.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive: CAS failed — owner must already be this thread.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor in the BasicLock so fast_unlock can skip the lookup.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
322
323 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
324 Register t2, Register t3) {
325 assert_different_registers(obj, box, t1, t2, t3);
326
327 // Handle inflated monitor.
328 Label inflated, inflated_load_mark;
329 // Finish fast unlock successfully. MUST branch to with flag == EQ
330 Label unlocked;
331 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
332 Label slow_path;
333
334 const Register t1_mark = t1;
335 const Register t2_top = t2;
336 const Register t3_t = t3;
337
338 { // Fast unlock
339
340 Label push_and_slow_path;
341
342 // Check if obj is top of lock-stack.
343 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
344 subw(t2_top, t2_top, oopSize);
345 ldr(t3_t, Address(rthread, t2_top));
346 cmp(obj, t3_t);
347 // Top of lock stack was not obj. Must be monitor.
348 br(Assembler::NE, inflated_load_mark);
349
350 // Pop lock-stack.
351 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
352 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
353
354 // Check if recursive.
355 subw(t3_t, t2_top, oopSize);
356 ldr(t3_t, Address(rthread, t3_t));
357 cmp(obj, t3_t);
358 br(Assembler::EQ, unlocked);
359
360 // Not recursive.
361 // Load Mark.
362 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
363
364 // Check header for monitor (0b10).
365 // Because we got here by popping (meaning we pushed in locked)
366 // there will be no monitor in the box. So we need to push back the obj
367 // so that the runtime can fix any potential anonymous owner.
368 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
369
370 // Try to unlock. Transition lock bits 0b00 => 0b01
371 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
372 orr(t3_t, t1_mark, markWord::unlocked_value);
373 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
374 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
375 br(Assembler::EQ, unlocked);
376
377 bind(push_and_slow_path);
378 // Compare and exchange failed.
379 // Restore lock-stack and handle the unlock in runtime.
380 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
381 addw(t2_top, t2_top, oopSize);
382 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
383 b(slow_path);
384 }
385
386
387 { // Handle inflated monitor.
388 bind(inflated_load_mark);
389 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
390 #ifdef ASSERT
391 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
392 stop("Fast Unlock not monitor");
393 #endif
394
395 bind(inflated);
396
397 #ifdef ASSERT
398 Label check_done;
399 subw(t2_top, t2_top, oopSize);
400 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
401 br(Assembler::LT, check_done);
402 ldr(t3_t, Address(rthread, t2_top));
403 cmp(obj, t3_t);
404 br(Assembler::NE, inflated);
405 stop("Fast Unlock lock on stack");
406 bind(check_done);
407 #endif
408
409 const Register t1_monitor = t1;
410
411 if (!UseObjectMonitorTable) {
412 assert(t1_monitor == t1_mark, "should be the same here");
413
414 // Untag the monitor.
415 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
416 } else {
417 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
418 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
419 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
420 br(Assembler::LO, slow_path);
421 }
422
423 const Register t2_recursions = t2;
424 Label not_recursive;
425
426 // Check if recursive.
427 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
428 cbz(t2_recursions, not_recursive);
429
430 // Recursive unlock.
431 sub(t2_recursions, t2_recursions, 1u);
432 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
433 // Set flag == EQ
434 cmp(t2_recursions, t2_recursions);
435 b(unlocked);
436
437 bind(not_recursive);
438
439 const Register t2_owner_addr = t2;
440
441 // Compute owner address.
442 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
443
444 // Set owner to null.
445 // Release to satisfy the JMM
446 stlr(zr, t2_owner_addr);
447 // We need a full fence after clearing owner to avoid stranding.
448 // StoreLoad achieves this.
449 membar(StoreLoad);
450
451 // Check if the entry_list is empty.
452 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
453 cmp(rscratch1, zr);
454 br(Assembler::EQ, unlocked); // If so we are done.
455
456 // Check if there is a successor.
457 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
458 cmp(rscratch1, zr);
459 br(Assembler::NE, unlocked); // If so we are done.
460
461 // Save the monitor pointer in the current thread, so we can try to
462 // reacquire the lock in SharedRuntime::monitor_exit_helper().
463 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
464
465 cmp(zr, rthread); // Set Flag to NE => slow path
466 b(slow_path);
467 }
468
469 bind(unlocked);
470 cmp(zr, zr); // Set Flags to EQ => fast path
471
472 #ifdef ASSERT
473 // Check that unlocked label is reached with Flags == EQ.
474 Label flag_correct;
475 br(Assembler::EQ, flag_correct);
476 stop("Fast Unlock Flag != EQ");
477 #endif
478
479 bind(slow_path);
480 #ifdef ASSERT
481 // Check that slow_path label is reached with Flags == NE.
482 br(Assembler::NE, flag_correct);
483 stop("Fast Unlock Flag != NE");
484 bind(flag_correct);
485 #endif
486 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
487 }
488
489 // Search for str1 in str2 and return index or -1
490 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
// Emits the String.indexOf intrinsic: search for str1 (pattern, cnt1 chars)
// in str2 (source, cnt2 chars) and set 'result' to the match index or -1.
//
// 'ae' encodes the Latin1/UTF-16 combination of the two strings
// (StrIntrinsicNode::LL/UU/UL/LU); 'icnt1' is the pattern length when known
// at compile time, or -1. For unknown lengths: patterns >= 8 chars with a
// sufficiently long source use Boyer-Moore-Horspool ('Bad Character' rule
// only), medium patterns go to the linear-scan stubs, and short patterns use
// the inline linear scan.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  // Per-encoding single-character load instructions (byte vs halfword).
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = y[i+j];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c< 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    // Build the 256-entry bad-character table in a stack-allocated buffer,
    // pre-filled with the pattern length (v0 was dup'ed with cnt1 above).
    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

    // Fill in per-character skip distances for the first cnt1-1 pattern chars.
      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        // UTF-16 pattern chars >= 256 have no table slot; skip the store.
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      // Preload the tail of the pattern into tmp6 for the inner comparison.
      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      // Compare the source char aligned with the last pattern char, then walk
      // backwards through the pattern on a hit.
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      // Mismatch: advance str2 by the bad-character skip distance.
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE); // deallocate the bad-character table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // Match index = (current str2 position - saved str2 base) in chars.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE); // deallocate the bad-character table
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
    BIND(LINEAR_MEDIUM);
      // Scan with negative offsets from the ends of the strings: a register
      // counting up to zero doubles as both loop counter and index.
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

    BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

    BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      // Pattern of exactly 4 chars: compare one whole-pattern load per step.
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      // Pattern of 2 chars: compare both at once.
      Label CH1_LOOP;

    BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      // Pattern of 3 chars: compare the first two at once, then the third.
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

    BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      // Single-char pattern: SWAR search — broadcast the char across a word
      // and detect a zero byte/halfword in (word XOR pattern).
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

    BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
    BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      eor(ch2, ch1, ch2);
      // "has zero element" trick: (x - 0x01..01) & ~x & 0x80..80 != 0.
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      // Handle the final (possibly overlapping) 8-byte window.
      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

    BIND(HAS_ZERO);
      // Locate the first matching element within the word.
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

    BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
    BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the negative byte offset back to a char index.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}
926
927 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
928 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
929
// Emits the String.indexOf(char) intrinsic for a UTF-16 string: search str1
// (cnt1 chars) for the 16-bit char 'ch' and set 'result' to its index or -1.
//
// Strings of >= 4 chars use a SWAR scan: 'ch' is broadcast across a 64-bit
// register and a zero-halfword test on (word XOR pattern) finds a match four
// chars per load. Shorter strings (and the final window) use a per-char loop.
// Clobbers rscratch1, rscratch2, tmp1-tmp3, cnt1 and str1.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string never matches.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Broadcast ch into all four 16-bit lanes of the register.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Scan with a negative offset from the end of the string so the counter
  // doubles as the index; reserve the last 4 chars for the re-check below.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    // "has zero halfword" trick: (x - 0x0001..) & ~x & 0x8000.. != 0.
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Handle the final (possibly overlapping) 8-byte window once.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Locate the first matching halfword within the loaded word.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 4 chars: simple per-char loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the negative byte offset back to a char index.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
992
// SVE variant of string_indexof_char: find the index of `ch` in a string of
// `cnt1` elements, leaving the index in `result`, or -1 if absent. `isL`
// selects Latin1 (byte) vs UTF-16 (halfword) elements.
// Clobbers: rscratch1, rscratch2, rflags
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = (isL == true) ? B : H;

  // Empty string: no match.
  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  // Elements processed per iteration = full vector of B or H lanes.
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    // result = idx - 1 + number of lanes up to and including the match.
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}
1061
// Find the index of the first occurrence of a (Latin1) byte in a Latin1
// string, leaving the index in `result`, or -1 if absent. Same SWAR scheme
// as string_indexof_char but with byte elements (8 per 64-bit word).
//   str1   - address of the string (clobbered)
//   cnt1   - number of bytes in the string (clobbered; aliased as cnt1_neg)
//   ch     - byte to search for (clobbered: replicated across the register)
//   tmp1-3 - temporaries
// Clobbers: rscratch1, rscratch2, rflags
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Empty string: no match.
  cbz(cnt1, NOMATCH);

  // Strings shorter than one 8-byte word go byte by byte.
  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate the byte into all eight lanes of a 64-bit register.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the last full word; iterate with a negative offset.
  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  // Per-lane borrow constant for the SWAR zero-byte test.
  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // XOR zeroes matching lanes; (x - 0x01) & ~(x | 0x7f) per lane flags
    // them in tmp1.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // If the loop did not end exactly on the final word, re-examine the last
    // word once (overlapping bytes that were already checked).
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // First zero lane: reverse + clz gives its bit index; >> 3 makes a byte
    // offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Short-string path: one byte per iteration.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1125
1126 // Compare strings.
// Lexicographically compare str1 and str2 a la String.compareTo, leaving
// a negative/zero/positive value in `result`. `ae` is the encoding pair
// (StrIntrinsicNode::LL/UU/LU/UL) of (str1, str2). Strings at or above
// `stub_threshold` chars are handed off to the compare_long_string stubs.
// Note: vtmp3, pgtmp1 and pgtmp2 are not referenced in this implementation.
// Clobbers: rscratch1, rscratch2, rflags; cnt1/cnt2/tmp1/tmp2/vtmp1/vtmp2.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Per-string element loads: byte for Latin1, halfword for UTF-16; the
  // extract at DIFF uses the matching zero-extension.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical address means identical content: done (result holds the
      // length difference).
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Run both strings from their ends with a negative index.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // LU: widen str1's Latin1 bytes to chars by interleaving with the zero
      // vector (zip1 with vtmpZ), then compare as 64-bit words.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // UL: symmetric to LU — str2's Latin1 bytes are widened instead.
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    // Round the bit index down to a whole character.
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  // Long strings: tail-call the dedicated out-of-line comparison stub.
  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1361
1362 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1363 FloatRegister src2, Condition cond, bool isQ) {
1364 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1365 FloatRegister zn = src1, zm = src2;
1366 bool needs_negation = false;
1367 switch (cond) {
1368 case LT: cond = GT; zn = src2; zm = src1; break;
1369 case LE: cond = GE; zn = src2; zm = src1; break;
1370 case LO: cond = HI; zn = src2; zm = src1; break;
1371 case LS: cond = HS; zn = src2; zm = src1; break;
1372 case NE: cond = EQ; needs_negation = true; break;
1373 default:
1374 break;
1375 }
1376
1377 if (is_floating_point_type(bt)) {
1378 fcm(cond, dst, size, zn, zm);
1379 } else {
1380 cm(cond, dst, size, zn, zm);
1381 }
1382
1383 if (needs_negation) {
1384 notr(dst, isQ ? T16B : T8B, dst);
1385 }
1386 }
1387
1388 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1389 Condition cond, bool isQ) {
1390 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1391 if (bt == T_FLOAT || bt == T_DOUBLE) {
1392 if (cond == Assembler::NE) {
1393 fcm(Assembler::EQ, dst, size, src);
1394 notr(dst, isQ ? T16B : T8B, dst);
1395 } else {
1396 fcm(cond, dst, size, src);
1397 }
1398 } else {
1399 if (cond == Assembler::NE) {
1400 cm(Assembler::EQ, dst, size, src);
1401 notr(dst, isQ ? T16B : T8B, dst);
1402 } else {
1403 cm(cond, dst, size, src);
1404 }
1405 }
1406 }
1407
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
// Each ORR folds the register onto itself with a doubling shift amount
// (7, 14, 28), gathering the LSBs of all 8 bytes into the low byte; the
// final AND discards everything above it.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1418
1419 // Pack the value of each mask element in "src" into a long value in "dst", at most
1420 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1421 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1422 // one bit in "dst".
1423 //
1424 // Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1425 // Expected: dst = 0x658D
1426 //
1427 // Clobbers: rscratch1
1428 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1429 FloatRegister vtmp, int lane_cnt) {
1430 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1431 assert_different_registers(dst, rscratch1);
1432 assert_different_registers(src, vtmp);
1433 assert(UseSVE > 0, "must be");
1434
1435 // Compress the lowest 8 bytes.
1436 fmovd(dst, src);
1437 bytemask_compress(dst);
1438 if (lane_cnt <= 8) return;
1439
1440 // Repeat on higher bytes and join the results.
1441 // Compress 8 bytes in each iteration.
1442 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1443 sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1444 bytemask_compress(rscratch1);
1445 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1446 }
1447 }
1448
// The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
// instruction which requires the FEAT_BITPERM feature.
// Clobbers: vtmp1, vtmp2
void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(src, vtmp1, vtmp2);
  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");

  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress each significant bit of the byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the biggest lane size (T = D) then
  // concatenate the results.

  // The second source input of BEXT, initialized with 0x01 in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BEXT vtmp1.D, src.D, vtmp2.D
  // src   = 0x0001010000010001 | 0x0100000001010001
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  sve_bext(vtmp1, D, src, vtmp2);

  // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
  // result to dst.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  // dst   = 0x658D
  if (lane_cnt <= 8) {
    // No need to concatenate.
    umov(dst, vtmp1, B, 0);
  } else if (lane_cnt <= 16) {
    // Move the second group's 8 result bits next to the first's.
    ins(vtmp1, B, vtmp1, 1, 8);
    umov(dst, vtmp1, H, 0);
  } else {
    // As the lane count is 64 at most, the final expected value must be in
    // the lowest 64 bits after narrowing vtmp1 from D to B.
    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
    umov(dst, vtmp1, D, 0);
  }
}
1492
// Unpack the mask, a long value in "src", into a vector register of boolean
// represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
// most 64 lanes.
//
// Below example gives the expected dst vector register, with a valid src(0x658D)
// on a 128-bit vector size machine.
// dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
                                           FloatRegister vtmp, int lane_cnt) {
  assert_different_registers(dst, vtmp);
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");

  // Example: src = 0x658D, lane_cnt = 16
  // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01

  // Put long value from general purpose register into the first lane of vector.
  // vtmp = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp, B, 0);
  mov(vtmp, D, 0, src);

  // Transform the value in the first lane which is mask in bit now to the mask in
  // byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    // Split the 16 mask bits: 8 into each D lane.
    ins(vtmp, B, vtmp, 8, 1);
  } else {
    // Spread one byte of mask bits into each D lane.
    sve_vector_extend(vtmp, D, vtmp, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(dst, B, 1);

  // BDEP dst.D, vtmp.D, dst.D
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  // dst  = 0x0101010101010101 | 0x0101010101010101
  //        ---------------------------------------
  // dst  = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(dst, D, vtmp, dst);
}
1539
1540 // Clobbers: rflags
1541 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1542 FloatRegister zn, FloatRegister zm, Condition cond) {
1543 assert(pg->is_governing(), "This register has to be a governing predicate register");
1544 FloatRegister z1 = zn, z2 = zm;
1545 switch (cond) {
1546 case LE: z1 = zm; z2 = zn; cond = GE; break;
1547 case LT: z1 = zm; z2 = zn; cond = GT; break;
1548 case LO: z1 = zm; z2 = zn; cond = HI; break;
1549 case LS: z1 = zm; z2 = zn; cond = HS; break;
1550 default:
1551 break;
1552 }
1553
1554 SIMD_RegVariant size = elemType_to_regVariant(bt);
1555 if (is_floating_point_type(bt)) {
1556 sve_fcm(cond, pd, size, pg, z1, z2);
1557 } else {
1558 assert(is_integral_type(bt), "unsupported element type");
1559 sve_cmp(cond, pd, size, pg, z1, z2);
1560 }
1561 }
1562
// Get index of the last mask lane that is set, or -1 if none is set.
// Clobbers: rscratch1
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the mask so the last set lane becomes the first.
  sve_rev(ptmp, size, src);
  // Keep only the lanes strictly before the first set lane of the reversed
  // mask ("break before"); their count is the distance of the last set lane
  // from the end of the vector (all lanes stay set if the mask is empty).
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  // index = (lane count - 1) - distance-from-end; -1 for an empty mask.
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1572
1573 // Extend integer vector src to dst with the same lane count
1574 // but larger element size, e.g. 4B -> 4I
1575 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1576 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1577 if (src_bt == T_BYTE) {
1578 // 4B to 4S/4I, 8B to 8S
1579 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1580 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1581 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1582 if (dst_bt == T_INT) {
1583 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1584 }
1585 } else if (src_bt == T_SHORT) {
1586 // 2S to 2I/2L, 4S to 4I
1587 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1588 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1589 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1590 if (dst_bt == T_LONG) {
1591 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1592 }
1593 } else if (src_bt == T_INT) {
1594 // 2I to 2L
1595 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1596 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1597 } else {
1598 ShouldNotReachHere();
1599 }
1600 }
1601
1602 // Narrow integer vector src down to dst with the same lane count
1603 // but smaller element size, e.g. 4I -> 4B
1604 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1605 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1606 if (src_bt == T_SHORT) {
1607 // 4S/8S to 4B/8B
1608 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1609 assert(dst_bt == T_BYTE, "unsupported");
1610 xtn(dst, T8B, src, T8H);
1611 } else if (src_bt == T_INT) {
1612 // 2I to 2S, 4I to 4B/4S
1613 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1614 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1615 xtn(dst, T4H, src, T4S);
1616 if (dst_bt == T_BYTE) {
1617 xtn(dst, T8B, dst, T8H);
1618 }
1619 } else if (src_bt == T_LONG) {
1620 // 2L to 2S/2I
1621 assert(src_vlen_in_bytes == 16, "unsupported");
1622 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1623 xtn(dst, T2S, src, T2D);
1624 if (dst_bt == T_SHORT) {
1625 xtn(dst, T4H, dst, T4S);
1626 }
1627 } else {
1628 ShouldNotReachHere();
1629 }
1630 }
1631
1632 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1633 FloatRegister src, SIMD_RegVariant src_size,
1634 bool is_unsigned) {
1635 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1636
1637 if (src_size == B) {
1638 switch (dst_size) {
1639 case H:
1640 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1641 break;
1642 case S:
1643 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1644 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1645 break;
1646 case D:
1647 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1648 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1649 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1650 break;
1651 default:
1652 ShouldNotReachHere();
1653 }
1654 } else if (src_size == H) {
1655 if (dst_size == S) {
1656 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1657 } else { // D
1658 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1659 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1660 }
1661 } else if (src_size == S) {
1662 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1663 }
1664 }
1665
// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  // tmp supplies the all-zero odd-position elements for uzp1, which is what
  // leaves the upper part of the narrowed result zero.
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      // Multi-step narrowing reuses dst as input, so it must differ from tmp.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}
1705
1706 // Extend src predicate to dst predicate with the same lane count but larger
1707 // element size, e.g. 64Byte -> 512Long
1708 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1709 uint dst_element_length_in_bytes,
1710 uint src_element_length_in_bytes) {
1711 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1712 sve_punpklo(dst, src);
1713 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1714 sve_punpklo(dst, src);
1715 sve_punpklo(dst, dst);
1716 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1717 sve_punpklo(dst, src);
1718 sve_punpklo(dst, dst);
1719 sve_punpklo(dst, dst);
1720 } else {
1721 assert(false, "unsupported");
1722 ShouldNotReachHere();
1723 }
1724 }
1725
1726 // Narrow src predicate to dst predicate with the same lane count but
1727 // smaller element size, e.g. 512Long -> 64Byte
1728 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1729 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1730 // The insignificant bits in src predicate are expected to be zero.
1731 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1732 // passed as the second argument. An example narrowing operation with a given mask would be -
1733 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1734 // Mask (for 2 Longs) : TF
1735 // Predicate register for the above mask (16 bits) : 00000001 00000000
1736 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1737 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1738 assert_different_registers(src, ptmp);
1739 assert_different_registers(dst, ptmp);
1740 sve_pfalse(ptmp);
1741 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1742 sve_uzp1(dst, B, src, ptmp);
1743 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1744 sve_uzp1(dst, H, src, ptmp);
1745 sve_uzp1(dst, B, dst, ptmp);
1746 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1747 sve_uzp1(dst, S, src, ptmp);
1748 sve_uzp1(dst, H, dst, ptmp);
1749 sve_uzp1(dst, B, dst, ptmp);
1750 } else {
1751 assert(false, "unsupported");
1752 ShouldNotReachHere();
1753 }
1754 }
1755
// Vector reduction add for integral type with ASIMD instructions.
// Computes dst = isrc + sum(vsrc lanes), narrowed to the element type.
//   isrc - scalar value folded into the reduction
//   vsrc - vector whose lanes are summed
// Clobbers: vtmp
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      // ADDV sums all lanes into lane 0; sign-extend it and add isrc.
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      // A 64-bit vector has only two S lanes, so one pairwise add suffices.
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      // Only 2L exists; sum the pair with ADDP.
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}
1794
// Vector reduction multiply for integral type with ASIMD instructions.
// Computes dst = isrc * product(vsrc lanes), narrowed to the element type
// by re-sign-extending after each scalar multiply.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1], truncated to byte after each step.
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      // Same halving scheme with 16-bit lanes; ends with two H lanes whose
      // product is folded into isrc with short truncation.
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      // Fold a 4S vector to 2S; a 64-bit source already has just two lanes.
      if (isQ) {
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      // Two D lanes: multiply both into isrc directly.
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1875
// Vector reduction multiply for floating-point type with ASIMD instructions.
// Multiplies the scalar input fsrc by every lane of vsrc, accumulating
// sequentially from lane 0 upwards (a fixed evaluation order, not a pairwise
// tree), and leaves the product in dst.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
  switch(bt) {
    case T_FLOAT:
      // dst = fsrc * vsrc[0]
      fmuls(dst, fsrc, vsrc);
      // dst *= vsrc[1]
      ins(vtmp, S, vsrc, 0, 1);
      fmuls(dst, dst, vtmp);
      if (isQ) {
        // 128-bit vector: also fold in lanes 2 and 3.
        ins(vtmp, S, vsrc, 0, 2);
        fmuls(dst, dst, vtmp);
        ins(vtmp, S, vsrc, 0, 3);
        fmuls(dst, dst, vtmp);
      }
      break;
    case T_DOUBLE:
      // Doubles only come as two 64-bit lanes in a 128-bit vector here.
      assert(isQ, "unsupported");
      // dst = fsrc * vsrc[0] * vsrc[1]
      fmuld(dst, fsrc, vsrc);
      ins(vtmp, D, vsrc, 0, 1);
      fmuld(dst, dst, vtmp);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}
1909
1910 // Helper to select logical instruction
1911 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1912 Register Rn, Register Rm,
1913 enum shift_kind kind, unsigned shift) {
1914 switch(opc) {
1915 case Op_AndReductionV:
1916 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1917 break;
1918 case Op_OrReductionV:
1919 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1920 break;
1921 case Op_XorReductionV:
1922 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1923 break;
1924 default:
1925 assert(false, "unsupported");
1926 ShouldNotReachHere();
1927 }
1928 }
1929
// Vector reduction logical operations And, Or, Xor
// Folds all lanes of vsrc together with the scalar input isrc using the
// logical operation selected by opc, leaving the sign-extended result in dst.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  // Move the two halves of the vector payload (two D lanes for a 128-bit
  // vector, two S lanes otherwise) into general registers and combine them.
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  // The remaining lanes are now packed in dst; keep folding the register
  // onto a shifted copy of itself, halving the width each step, until a
  // single element remains, then fold in the scalar input.
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      // Sign-extend the byte result into the full register.
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      // Sign-extend the short result into the full register.
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      // 64-bit lanes only exist in 128-bit vectors; both halves were
      // already combined above, so only the scalar input remains.
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}
1979
1980 // Helper function to decode min/max reduction operation properties
1981 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
1982 bool* is_unsigned,
1983 Condition* cond) {
1984 switch(opc) {
1985 case Op_MinReductionV:
1986 *is_min = true; *is_unsigned = false; *cond = LT; break;
1987 case Op_MaxReductionV:
1988 *is_min = false; *is_unsigned = false; *cond = GT; break;
1989 case Op_UMinReductionV:
1990 *is_min = true; *is_unsigned = true; *cond = LO; break;
1991 case Op_UMaxReductionV:
1992 *is_min = false; *is_unsigned = true; *cond = HI; break;
1993 default:
1994 ShouldNotReachHere();
1995 }
1996 }
1997
// Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
// Reduces the lanes of vsrc together with the scalar input isrc, leaving the
// winning value in dst.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
         opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min;
  bool is_unsigned;
  Condition cond;
  decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    // 64-bit lanes: reduce with scalar compare-and-select over both lanes
    // and the scalar input.
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, cond);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, cond);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      // For T2S (2x32-bit elements), use pairwise instructions because
      // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
      neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
    } else {
      // For other sizes, use reduction to scalar instructions.
      neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
    }
    // Move the reduced lane into a general register. Signed sub-word
    // results take a sign-extending move (smov) so the 32-bit compare
    // below sees the correct value.
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else if (is_unsigned) {
      umov(dst, vtmp, elemType_to_regVariant(bt), 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    // Fold in the scalar input.
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, cond);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
2047
// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
// The predicated SVE reduction is applied to src2, the scalar input src1 is
// folded in, and the result is left in dst.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        // Sub-word sum: move with sign extension folded into the add.
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      // int/long results move zero-extended; sub-word signed results move
      // sign-extended (final sxtb/sxth below re-normalizes after the fold).
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV:
    case Op_MinReductionV:
    case Op_UMaxReductionV:
    case Op_UMinReductionV: {
      bool is_min;
      bool is_unsigned;
      Condition cond;
      decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
      sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
      // Move result from vector to general register
      if (is_unsigned || bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      // Fold in the scalar input with compare-and-select.
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, cond);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, cond);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // Logical reductions on sub-word types need a final sign extension since
  // the fold with src1 above operated on the 32-bit register.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
2158
// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue". The 5-bit values are the SVE PTRUE pattern
  // encodings for the exact element counts that the instruction supports
  // as immediates.
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      // For VL1..VL8 the pattern encoding equals the lane count itself.
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue": largest power of two, largest multiple
  // of 4, and largest multiple of 3 not exceeding the max lane count.
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases: lanes with index
    // < lane_cnt become active.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}
2225
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Works around the lack of an SVE COMPACT for 16-bit elements by widening
// each half to 32-bit lanes, compacting, narrowing back, and splicing the
// two compressed halves together.
// Clobbers: rscratch1
// Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // high <-- low
  // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst = 00 00 00 hh ee dd bb aa

  // Extend lowest half to type INT.
  // dst = 00dd 00cc 00bb 00aa
  sve_uunpklo(dst, S, src);
  // pgtmp = 0001 0000 0001 0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining lanes with zero.
  // dst = 0000 00dd 00bb 00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half; used below as the splice
  // point where the compressed high half is appended.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 0001 0000 0000 0001
  sve_punpkhi(pgtmp, mask);
  // vtmp = 00hh 00gg 00ff 00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp = 0000 0000 00hh 00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:   dst   = 00 00 00 00 00 dd bb aa
  // Compressed high:  vtmp  = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  // dst = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2284
// Pack active byte elements of src, under the control of mask, into the
// lowest-numbered elements of dst; remaining lanes become zero. Each half is
// widened to shorts and handled by sve_compress_short, then the halves are
// spliced back together.
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 00 01 00 00 00 01 00 01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining lanes with zero.
  // dst = 00 00 00 00 00 0g 0c 0a
  // The widened data is twice as long; cap the length passed down at the
  // full vector size.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half; used below as the splice
  // point where the compressed high half is appended.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp = 00 01 00 00 00 00 00 01
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 00 00 00 00 00 00 0p 0i
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:   dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high:  vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2341
2342 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2343 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2344 SIMD_Arrangement size = isQ ? T16B : T8B;
2345 if (bt == T_BYTE) {
2346 rbit(dst, size, src);
2347 } else {
2348 neon_reverse_bytes(dst, src, bt, isQ);
2349 rbit(dst, size, dst);
2350 }
2351 }
2352
2353 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2354 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2355 SIMD_Arrangement size = isQ ? T16B : T8B;
2356 switch (bt) {
2357 case T_BYTE:
2358 if (dst != src) {
2359 orr(dst, size, src, src);
2360 }
2361 break;
2362 case T_SHORT:
2363 rev16(dst, size, src);
2364 break;
2365 case T_INT:
2366 rev32(dst, size, src);
2367 break;
2368 case T_LONG:
2369 rev64(dst, size, src);
2370 break;
2371 default:
2372 assert(false, "unsupported");
2373 ShouldNotReachHere();
2374 }
2375 }
2376
// VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
// But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
// For VectorRearrange long/double, we compare the shuffle input with iota indices,
// and use bsl to implement the operation.
void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
                                           FloatRegister shuffle, FloatRegister tmp,
                                           BasicType bt, bool isQ) {
  assert_different_registers(dst, src, shuffle, tmp);
  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Here is an example that rearranges a NEON vector with 4 ints:
  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
  //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
  //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
  //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
  //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
  //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
  //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
  //   4. Use Vm as index register, and use V1 as table register.
  //      Then get V2 as the result by tbl NEON instructions.
  switch (bt) {
    case T_SHORT:
      // 2-byte elements: scale indices by 2 and add the byte offsets
      // {0, 1} within each element.
      mov(tmp, size1, 0x02);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x0100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_INT:
    case T_FLOAT:
      // 4-byte elements: scale indices by 4 and add the byte offsets
      // {0, 1, 2, 3} within each element.
      mov(tmp, size1, 0x04);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x03020100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_LONG:
    case T_DOUBLE:
      // Load the iota indices for Long type. The indices are ordered by
      // type B/S/I/L/F/D, and the offset between two types is 16; Hence
      // the offset for L is 48.
      lea(rscratch1,
          ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
      ldrq(tmp, rscratch1);
      // Check whether the input "shuffle" is the same with iota indices.
      // Return "src" if true, otherwise swap the two elements of "src".
      cm(EQ, dst, size2, shuffle, tmp);
      ext(tmp, size1, src, src, 8);
      bsl(dst, size1, src, tmp);
      break;
    default:
      assert(false, "unsupported element type");
      ShouldNotReachHere();
  }
}
2434
// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
// Sub-word elements are sign-extended into dst (smov); int/long use umov.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    // The element lies within the NEON-addressable low 128 bits.
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    // Element is beyond the NEON range: shift it down to lane 0 with
    // sve_ext on a copy of src (src itself is preserved), then move
    // lane 0 out. "idx << size" converts the lane index into a byte
    // offset for sve_ext.
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}
2457
2458 // java.lang.Math::round intrinsics
2459
// Vectorized java.lang.Math::round for float/double lanes using NEON.
// Per lane, combines floor(src + 0.5) (ties to even) with fcvtas (round to
// nearest, ties away) and selects between the two results; the magnitude
// threshold loaded below (2^23 for float, 2^52 for double) is the point
// beyond which the values are integral anyway.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  // Build a per-lane selection mask by an unsigned compare of the bit
  // pattern of -src against the threshold constant.
  // NOTE(review): this relies on the ordering of IEEE-754 bit patterns;
  // lanes where the compare fails take the floor(src + 0.5) result.
  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // Insert tmp1 bits into dst wherever the tmp3 mask bits are clear.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2492
// Vectorized java.lang.Math::round for float/double lanes using SVE.
// Starts from frinta (round to nearest, ties away) and, for lanes whose
// magnitude is below the threshold (2^23 for float, 2^52 for double),
// recomputes floor(src + 0.5) under a predicate before converting to an
// integer result.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // Predicate the lanes that need the floor(src + 0.5) treatment, using an
  // unsigned compare of the bit pattern of -src against the threshold.
  // NOTE(review): relies on IEEE-754 bit-pattern ordering, as in the NEON
  // variant above.
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // Skip the correction entirely when no lane is selected.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  // Convert the rounded floating-point values to integers.
  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2530
// Vectorized Math.signum with NEON: each lane of dst becomes +1.0/-1.0 with
// the sign of the corresponding src lane, or the src lane itself when it is
// +-0.0 or NaN. "zero" must hold 0.0 and "one" must hold 1.0 in every lane.
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  // All-ones lanes where |src| > 0, i.e. not +-0.0 and not NaN.
  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  // Bitwise select: take the magnitude bits of "one" where the mask is set
  // and everything else (notably the sign bit) from src, yielding +-1.0 for
  // ordinary values and src unchanged for +-0.0/NaN.
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
2540
// Vectorized Math.signum with SVE: each lane of dst becomes +1.0/-1.0 with
// the sign of the corresponding src lane, or the src lane itself when it is
// +-0.0 or NaN. "zero" must hold 0.0 in every lane.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  // Work on a copy of src so the source vector is preserved.
  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      // Same as above for double lanes: keep the sign bit, OR in +1.0.
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}
2565
2566 bool C2_MacroAssembler::in_scratch_emit_size() {
2567 if (ciEnv::current()->task() != nullptr) {
2568 PhaseOutput* phase_output = Compile::current()->output();
2569 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2570 return true;
2571 }
2572 }
2573 return MacroAssembler::in_scratch_emit_size();
2574 }
2575
// Runtime target of the generated range-check failure path in
// verify_int_in_range: reports the violating CastII and aborts the VM.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2579
// Emit a runtime check that the int in rval lies within the range
// [t->_lo, t->_hi] of a CastII node; on violation, calls an aborting helper
// with (idx, val, lo, hi). Emits nothing for the unconstrained TypeInt::INT.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  // Only emit a bound check when the bound can actually be violated.
  if (lo != min_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jint) {
    subsw(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal the arguments for abort_verify_int_in_range.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  // Make sure rfp is valid before calling out of compiled code
  // (presumably needed for the error handler's stack walk — see
  // reconstruct_frame_pointer).
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  // The helper never returns; trap if it somehow does.
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}
2614
// Runtime target of the generated range-check failure path in
// verify_long_in_range: reports the violating CastLL and aborts the VM.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2618
// Emit a runtime check that the long in rval lies within the range
// [t->_lo, t->_hi] of a CastLL node; on violation, calls an aborting helper
// with (idx, val, lo, hi). Emits nothing for the unconstrained TypeLong::LONG.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  // Only emit a bound check when the bound can actually be violated.
  if (lo != min_jlong) {
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jlong) {
    subs(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal the arguments for abort_verify_long_in_range.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  mov(c_rarg2, lo);
  mov(c_rarg3, hi);
  // Make sure rfp is valid before calling out of compiled code
  // (presumably needed for the error handler's stack walk — see
  // reconstruct_frame_pointer).
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
  // The helper never returns; trap if it somehow does.
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_long_in_range");
}
2653
// Make rfp hold a valid frame pointer for the current compiled frame.
// With PreserveFramePointer rfp is already maintained (only verified in
// debug builds); otherwise it is recomputed from sp and the frame size.
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rfp.
    add(rtmp, sp, framesize - 2 * wordSize);
    Label L_success;
    cmp(rfp, rtmp);
    br(Assembler::EQ, L_success);
    stop("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    // Recompute: the saved fp slot sits two words below the top of frame.
    add(rfp, sp, framesize - 2 * wordSize);
  }
}
2671
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using Neon instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
                                                     FloatRegister src2, FloatRegister index,
                                                     FloatRegister tmp, unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, tmp);
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;

  if (vector_length_in_bytes == 16) {
    assert(UseSVE <= 1, "sve must be <= 1");
    // The two-register table form of "tbl" requires consecutively
    // numbered source registers.
    assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
    tbl(dst, size, src1, 2, index);
  } else { // vector length == 8
    assert(UseSVE == 0, "must be Neon only");
    // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
    // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
    // instruction with one vector lookup
    ins(tmp, D, src1, 0, 0); // tmp[63:0]   = src1
    ins(tmp, D, src2, 1, 0); // tmp[127:64] = src2
    tbl(dst, size, tmp, 1, index);
  }
}
2699
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
                                                    FloatRegister src2, FloatRegister index,
                                                    FloatRegister tmp, SIMD_RegVariant T,
                                                    unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, index, tmp);

  if (vector_length_in_bytes == 8) {
    // We need to fit both the source vectors (src1, src2) in a single vector register because the
    // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
    // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
    // instruction with one vector lookup
    assert(UseSVE >= 1, "sve must be >= 1");
    ins(tmp, D, src1, 0, 0); // tmp[63:0]   = src1
    ins(tmp, D, src2, 1, 0); // tmp[127:64] = src2
    sve_tbl(dst, T, tmp, index);
  } else { // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
    // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
    // is not executed on machines where vector_length_in_bytes < MaxVectorSize
    // with the only exception of 8B vector length.
    assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
    // The two-register table form requires consecutively numbered sources.
    assert(src1->successor() == src2, "Source registers must be ordered");
    sve_tbl(dst, T, src1, src2, index);
  }
}
2731
2732 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2733 FloatRegister src2, FloatRegister index,
2734 FloatRegister tmp, BasicType bt,
2735 unsigned vector_length_in_bytes) {
2736
2737 assert_different_registers(dst, src1, src2, index, tmp);
2738
2739 // The cases that can reach this method are -
2740 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2741 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2742 //
2743 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2744 // and UseSVE = 2 with vector_length_in_bytes >= 8
2745 //
2746 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2747 // UseSVE = 1 with vector_length_in_bytes = 16
2748
2749 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2750 SIMD_RegVariant T = elemType_to_regVariant(bt);
2751 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2752 return;
2753 }
2754
2755 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2756 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2757 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2758
2759 bool isQ = vector_length_in_bytes == 16;
2760
2761 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2762 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2763
2764 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2765 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2766 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2767 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2768 // the indices can range from [0, 8).
2769 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2770 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2771 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2772 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2773 // Add the multiplied result to the vector in tmp to obtain the byte level
2774 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2775 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2776
2777 if (bt == T_BYTE) {
2778 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2779 } else {
2780 int elem_size = (bt == T_SHORT) ? 2 : 4;
2781 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2782
2783 mov(tmp, size1, elem_size);
2784 mulv(dst, size2, index, tmp);
2785 mov(tmp, size2, tbl_offset);
2786 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2787 // to select a set of 2B/4B
2788 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2789 }
2790 }
2791
2792 // Vector expand implementation. Elements from the src vector are expanded into
2793 // the dst vector under the control of the vector mask.
2794 // Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
2796 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2797 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2798 // for NEON and SVE, but with different instructions where appropriate.
2799
2800 // Vector expand implementation for NEON.
2801 //
2802 // An example of 128-bit Byte vector:
2803 // Data direction: high <== low
2804 // Input:
2805 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2806 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2807 // Expected result:
2808 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2809 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2810 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2811 int vector_length_in_bytes) {
2812 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2813 assert_different_registers(dst, src, mask, tmp1, tmp2);
2814 // Since the TBL instruction only supports byte table, we need to
2815 // compute indices in byte type for all types.
2816 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2817 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2818 dup(tmp1, size, zr);
2819 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
2820 negr(dst, size, mask);
2821 // Calculate vector index for TBL with prefix sum algorithm.
2822 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
2823 for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2824 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2825 addv(dst, size, tmp2, dst);
2826 }
2827 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2828 orr(tmp2, size, mask, mask);
2829 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2830 bsl(tmp2, size, dst, tmp1);
2831 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2832 movi(tmp1, size, 1);
2833 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
2834 subv(dst, size, tmp2, tmp1);
2835 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2836 tbl(dst, size, src, 1, dst);
2837 }
2838
2839 // Vector expand implementation for SVE.
2840 //
2841 // An example of 128-bit Short vector:
2842 // Data direction: high <== low
2843 // Input:
2844 // src = gf ed cb a9 87 65 43 21
2845 // pg = 00 01 00 01 00 01 00 01
2846 // Expected result:
2847 // dst = 00 87 00 65 00 43 00 21
2848 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2849 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2850 int vector_length_in_bytes) {
2851 assert(UseSVE > 0, "expand implementation only for SVE");
2852 assert_different_registers(dst, src, tmp1, tmp2);
2853 SIMD_RegVariant size = elemType_to_regVariant(bt);
2854
2855 // tmp1 = 00 00 00 00 00 00 00 00
2856 sve_dup(tmp1, size, 0);
2857 sve_movprfx(tmp2, tmp1);
2858 // tmp2 = 00 01 00 01 00 01 00 01
2859 sve_cpy(tmp2, size, pg, 1, true);
2860 // Calculate vector index for TBL with prefix sum algorithm.
2861 // tmp2 = 04 04 03 03 02 02 01 01
2862 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2863 sve_movprfx(dst, tmp1);
2864 // The EXT instruction operates on the full-width sve register. The correct
2865 // index calculation method is:
2866 // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2867 // MaxVectorSize - i.
2868 sve_ext(dst, tmp2, MaxVectorSize - i);
2869 sve_add(tmp2, size, dst, tmp2);
2870 }
2871 // dst = 00 04 00 03 00 02 00 01
2872 sve_sel(dst, size, pg, tmp2, tmp1);
2873 // dst = -1 03 -1 02 -1 01 -1 00
2874 sve_sub(dst, size, 1);
2875 // dst = 00 87 00 65 00 43 00 21
2876 sve_tbl(dst, size, src, dst);
2877 }
2878
2879 // Optimized SVE cpy (imm, zeroing) instruction.
2880 //
2881 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2882 // functionality, but test results show that `movi; cpy(imm, merging)` has
2883 // higher throughput on some microarchitectures. This would depend on
2884 // microarchitecture and so may vary between implementations.
2885 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2886 PRegister pg, int imm8, bool isMerge) {
2887 if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2888 // Generates a NEON instruction `movi V<dst>.2d, #0`.
2889 // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2890 // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2891 // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2892 // entire Z<dst> register. According to the Arm Software Optimization
2893 // Guide, `movi` is zero latency.
2894 movi(dst, T2D, 0);
2895 isMerge = true;
2896 }
2897 Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2898 }