1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright 2026 Arm Limited and/or its affiliates.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "opto/c2_MacroAssembler.hpp"
29 #include "opto/compile.hpp"
30 #include "opto/intrinsicnode.hpp"
31 #include "opto/matcher.hpp"
32 #include "opto/output.hpp"
33 #include "opto/subnode.hpp"
34 #include "runtime/objectMonitorTable.hpp"
35 #include "runtime/stubRoutines.hpp"
36 #include "runtime/synchronizer.hpp"
37 #include "utilities/globalDefinitions.hpp"
38 #include "utilities/powerOfTwo.hpp"
39
// Debug-build helpers for annotating generated code:
//  - BLOCK_COMMENT(str): records a human-readable comment alongside the
//    emitted instructions in non-PRODUCT builds; compiles to nothing in
//    PRODUCT builds.
//  - STOP(error): halts the VM with the given message; in non-PRODUCT builds
//    the message is additionally recorded as a code comment.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Bind a label and annotate the binding point with the label's name.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Pointer-to-member type for a single-character load (e.g. ldrb for Latin1,
// ldrh for UTF-16); lets the string intrinsics pick the load width at
// code-generation time.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
51
52 void C2_MacroAssembler::entry_barrier() {
53 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
54 // Dummy labels for just measuring the code size
55 Label dummy_slow_path;
56 Label dummy_continuation;
57 Label dummy_guard;
58 Label* slow_path = &dummy_slow_path;
59 Label* continuation = &dummy_continuation;
60 Label* guard = &dummy_guard;
61 if (!Compile::current()->output()->in_scratch_emit_size()) {
62 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
63 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
64 Compile::current()->output()->add_stub(stub);
65 slow_path = &stub->entry();
66 continuation = &stub->continuation();
67 guard = &stub->guard();
68 }
69 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
70 bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
71 }
72
// jdk.internal.util.ArraysSupport.vectorizedHashCode
//
// Folds the cnt elements at ary into result using the standard Java
// polynomial hash (result = result * 31 + element, per element — see the
// scalar loop below). Supported element types: T_BOOLEAN, T_BYTE, T_CHAR,
// T_SHORT, T_INT. Large inputs are delegated to an out-of-line stub
// (StubRoutines::aarch64::large_arrays_hashcode); small inputs and the
// remainder use an unrolled scalar loop entered via a computed branch.
//
// Returns pc() on success, or nullptr if the trampoline call to the stub
// could not be emitted (code cache full).
// Clobbers rscratch1/rscratch2; the vector-register arguments are fixed by
// ARRAYS_HASHCODE_REGISTERS and presumably used by the stub — they are not
// touched by the inline scalar path.
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT ? 4
                    : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  movw(tmp2, 0x1f); // 31, the hash multiplier used by the madd below
  br(tmp1);

  bind(LOOP);
  // Unrolled body: result = result * 31 + element, one element per step.
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    maddw(result, result, tmp2, tmp1);
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    // Code cache is full: abandon the intrinsic and report failure.
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}
173
// C2 fast path for monitorenter: lightweight (lock-stack) locking with an
// inflated-monitor fallback. On exit the condition flags encode the outcome:
// EQ => lock acquired (fast path succeeded), NE => caller must call the
// runtime slow path. Clobbers t1, t2, t3 and rscratch2.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must not be synchronized on; divert to the runtime
    // so it can diagnose/report.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive: obj already on top of the lock-stack means we hold
    // it and can simply push it again.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    // expected = mark with unlocked bit set, new = mark with lock bits clear.
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      // Without the table the (tagged) mark word itself is the monitor pointer.
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      const Register t1_hash = t1;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mov(t3, t1_mark);

      // Look for the monitor in the om_cache.

      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
        ldr(t2, Address(rthread, cache_offset));
        cmp(obj, t2);
        br(Assembler::EQ, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);

      // Get the table and calculate the bucket's address
      lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
      ldr(t3, Address(t3));
      ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
      ands(t1_hash, t1_hash, t2);
      ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
      br(Assembler::LO, slow_path);

      // Check if object matches.
      ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, t3, t2, slow_path);
      cmp(t3, obj);
      br(Assembler::NE, slow_path);

      bind(monitor_found);
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // Without the table the monitor pointer carries the markWord monitor tag,
    // which must be subtracted out of the field offsets.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive: the observed owner (t3_owner) is already us.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor for a subsequent fast_unlock.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
344
345 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
346 Register t2, Register t3) {
347 assert_different_registers(obj, box, t1, t2, t3);
348
349 // Handle inflated monitor.
350 Label inflated, inflated_load_mark;
351 // Finish fast unlock successfully. MUST branch to with flag == EQ
352 Label unlocked;
353 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
354 Label slow_path;
355
356 const Register t1_mark = t1;
357 const Register t2_top = t2;
358 const Register t3_t = t3;
359
360 { // Fast unlock
361
362 Label push_and_slow_path;
363
364 // Check if obj is top of lock-stack.
365 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
366 subw(t2_top, t2_top, oopSize);
367 ldr(t3_t, Address(rthread, t2_top));
368 cmp(obj, t3_t);
369 // Top of lock stack was not obj. Must be monitor.
370 br(Assembler::NE, inflated_load_mark);
371
372 // Pop lock-stack.
373 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
374 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
375
376 // Check if recursive.
377 subw(t3_t, t2_top, oopSize);
378 ldr(t3_t, Address(rthread, t3_t));
379 cmp(obj, t3_t);
380 br(Assembler::EQ, unlocked);
381
382 // Not recursive.
383 // Load Mark.
384 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
385
386 // Check header for monitor (0b10).
387 // Because we got here by popping (meaning we pushed in locked)
388 // there will be no monitor in the box. So we need to push back the obj
389 // so that the runtime can fix any potential anonymous owner.
390 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
391
392 // Try to unlock. Transition lock bits 0b00 => 0b01
393 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
394 orr(t3_t, t1_mark, markWord::unlocked_value);
395 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
396 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
397 br(Assembler::EQ, unlocked);
398
399 bind(push_and_slow_path);
400 // Compare and exchange failed.
401 // Restore lock-stack and handle the unlock in runtime.
402 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
403 addw(t2_top, t2_top, oopSize);
404 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
405 b(slow_path);
406 }
407
408
409 { // Handle inflated monitor.
410 bind(inflated_load_mark);
411 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
412 #ifdef ASSERT
413 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
414 stop("Fast Unlock not monitor");
415 #endif
416
417 bind(inflated);
418
419 #ifdef ASSERT
420 Label check_done;
421 subw(t2_top, t2_top, oopSize);
422 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
423 br(Assembler::LT, check_done);
424 ldr(t3_t, Address(rthread, t2_top));
425 cmp(obj, t3_t);
426 br(Assembler::NE, inflated);
427 stop("Fast Unlock lock on stack");
428 bind(check_done);
429 #endif
430
431 const Register t1_monitor = t1;
432
433 if (!UseObjectMonitorTable) {
434 assert(t1_monitor == t1_mark, "should be the same here");
435
436 // Untag the monitor.
437 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
438 } else {
439 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
440 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
441 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
442 br(Assembler::LO, slow_path);
443 }
444
445 const Register t2_recursions = t2;
446 Label not_recursive;
447
448 // Check if recursive.
449 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
450 cbz(t2_recursions, not_recursive);
451
452 // Recursive unlock.
453 sub(t2_recursions, t2_recursions, 1u);
454 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
455 // Set flag == EQ
456 cmp(t2_recursions, t2_recursions);
457 b(unlocked);
458
459 bind(not_recursive);
460
461 const Register t2_owner_addr = t2;
462
463 // Compute owner address.
464 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
465
466 // Set owner to null.
467 // Release to satisfy the JMM
468 stlr(zr, t2_owner_addr);
469 // We need a full fence after clearing owner to avoid stranding.
470 // StoreLoad achieves this.
471 membar(StoreLoad);
472
473 // Check if the entry_list is empty.
474 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
475 cmp(rscratch1, zr);
476 br(Assembler::EQ, unlocked); // If so we are done.
477
478 // Check if there is a successor.
479 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
480 cmp(rscratch1, zr);
481 br(Assembler::NE, unlocked); // If so we are done.
482
483 // Save the monitor pointer in the current thread, so we can try to
484 // reacquire the lock in SharedRuntime::monitor_exit_helper().
485 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
486
487 cmp(zr, rthread); // Set Flag to NE => slow path
488 b(slow_path);
489 }
490
491 bind(unlocked);
492 cmp(zr, zr); // Set Flags to EQ => fast path
493
494 #ifdef ASSERT
495 // Check that unlocked label is reached with Flags == EQ.
496 Label flag_correct;
497 br(Assembler::EQ, flag_correct);
498 stop("Fast Unlock Flag != EQ");
499 #endif
500
501 bind(slow_path);
502 #ifdef ASSERT
503 // Check that slow_path label is reached with Flags == NE.
504 br(Assembler::NE, flag_correct);
505 stop("Fast Unlock Flag != NE");
506 bind(flag_correct);
507 #endif
508 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
509 }
510
// Search for str1 in str2 and return index or -1
//
// str2/cnt2 are the source string and its length in characters; str1/cnt1 are
// the pattern and its length. icnt1 is the pattern length when it is a
// compile-time constant (-1 otherwise). ae encodes the two strings' encodings
// (StrIntrinsicNode::LL/UU/UL/LU — Latin1 vs UTF-16, see str1_isL/str2_isL
// below). On completion, result holds the match index or -1.
// Large patterns use a Boyer-Moore-Horspool search (or a stub for medium
// sizes); small patterns use specialized linear scans.
//
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = x[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  //     #else
  //     if (c < ASIZE) bc[c] = m - i;
  //     #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = y[i+j];
  //     if (x[m-1] == c)
  //       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //     if (i < 0) return j;
  //     // c < 256 for Latin1 string, so, no need for branch
  //     #ifdef SOURCE_STRING_IS_LATIN1
  //     // LL case: (c< 256) always true. Remove branch
  //     j += bc[y[j+m-1]];
  //     #endif
  //     #ifndef PATTERN_STRING_IS_UTF
  //     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += 1
  //     #endif
  //     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[y[j+m-1]];
  //     else
  //       j += m
  //     #endif
  //   }
  // }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // Allocate the bad-character table on the stack and fill it with the
    // pattern length (v0 was pre-filled with cnt1 above).
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

    sub(cnt1tmp, cnt1, 1);
    mov(tmp5, str2);
    add(str2end, str2, result_tmp, LSL, str2_chr_shift);
    sub(ch2, cnt1, 1);
    mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

    add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
    if (str1_isL == str2_isL) {
      // load last 8 bytes (8LL/4UU symbols)
      ldr(tmp6, Address(tmp6, -wordSize));
    } else {
      ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
      // convert Latin1 to UTF. We'll have to wait until load completed, but
      // it's still faster than per-character loads+checks
      lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
      ubfx(ch1, tmp6, 8, 8); // str1[N-2]
      ubfx(ch2, tmp6, 16, 8); // str1[N-3]
      andr(tmp6, tmp6, 0xFF); // str1[N-4]
      orr(ch2, ch1, ch2, LSL, 16);
      orr(tmp6, tmp6, tmp3, LSL, 48);
      orr(tmp6, tmp6, ch2, LSL, 16);
    }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
    add(sp, sp, ASIZE); // deallocate the bad-character table
    b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE); // deallocate the bad-character table
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
      // Scan for the pattern's first character, then verify the rest with
      // negative indices counting up to zero (zero => end reached).
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

      // Constant 4-char pattern: load it whole and compare word-at-a-time.
      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
      // 3-char pattern: match first two chars as a unit, then the third.
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      // Broadcast the single pattern char across a 64-bit word and scan the
      // source 8 bytes at a time.
      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        // SWAR zero-element test: after the eor a matching character becomes a
        // zero byte/halfword; (x - 0x01..) & ~(x | 0x7f..) is non-zero iff x
        // contains a zero element.
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // Re-check the final (possibly overlapping) word at the end of str2.
        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Locate the first zero element: reverse, count leading zeros, and
        // convert the bit position to a byte offset.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the (negative) byte offset back into a character index.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}
948
// Member-function-pointer types used to select the byte vs halfword variant
// of an emitter at code-generation time, depending on the string encoding
// (Latin-1 vs UTF-16): character loads (ldrb/ldrh) and zero-extends
// (uxtbw/uxthw) — see their use in string_compare below.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
951
// Find the first occurrence of the UTF-16 char `ch` in the UTF-16 string
// [str1, str1 + 2*cnt1). Leaves the char index in `result`, or -1 when the
// char is not found. Strings of 4 or more chars are scanned one 64-bit word
// (4 chars) at a time using the SWAR "has-zero-halfword" trick; shorter
// strings use a simple per-char loop.
// Clobbers: str1, cnt1, ch, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;  // re-used as the negative byte offset from the string end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // The empty string never matches.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate the 16-bit char into all four halfwords of `ch`.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at (end - 4 chars) and walk it with a negative, increasing
  // byte offset so the loop-termination check is a flag-setting add.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  // SWAR constant: 0x0001 in every halfword lane.
  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  // XOR zeroes the halfword lanes that match `ch`; then
  // (x - 0x0001) & ~(x | 0x7fff) leaves 0x8000 exactly in zero lanes.
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  // Ran past the end: unless the last iteration ended exactly on the final
  // word, re-scan the (possibly overlapping) last word at offset 0.
  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  // Locate the lowest matching halfword: byte-reverse so a leading-zero
  // count gives the bit position, then convert bits to bytes (LSR 3).
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  // Fewer than 4 chars: compare one char at a time.
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  // Char index = base count + (signed byte offset / 2).
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
1014
1015 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1016 Register ch, Register result,
1017 FloatRegister ztmp1,
1018 FloatRegister ztmp2,
1019 PRegister tmp_pg,
1020 PRegister tmp_pdn, bool isL)
1021 {
1022 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1023 assert(tmp_pg->is_governing(),
1024 "this register has to be a governing predicate register");
1025
1026 Label LOOP, MATCH, DONE, NOMATCH;
1027 Register vec_len = rscratch1;
1028 Register idx = rscratch2;
1029
1030 SIMD_RegVariant T = (isL == true) ? B : H;
1031
1032 cbz(cnt1, NOMATCH);
1033
1034 // Assign the particular char throughout the vector.
1035 sve_dup(ztmp2, T, ch);
1036 if (isL) {
1037 sve_cntb(vec_len);
1038 } else {
1039 sve_cnth(vec_len);
1040 }
1041 mov(idx, 0);
1042
1043 // Generate a predicate to control the reading of input string.
1044 sve_whilelt(tmp_pg, T, idx, cnt1);
1045
1046 BIND(LOOP);
1047 // Read a vector of 8- or 16-bit data depending on the string type. Note
1048 // that inactive elements indicated by the predicate register won't cause
1049 // a data read from memory to the destination vector.
1050 if (isL) {
1051 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1052 } else {
1053 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1054 }
1055 add(idx, idx, vec_len);
1056
1057 // Perform the comparison. An element of the destination predicate is set
1058 // to active if the particular char is matched.
1059 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1060
1061 // Branch if the particular char is found.
1062 br(NE, MATCH);
1063
1064 sve_whilelt(tmp_pg, T, idx, cnt1);
1065
1066 // Loop back if the particular char not found.
1067 br(MI, LOOP);
1068
1069 BIND(NOMATCH);
1070 mov(result, -1);
1071 b(DONE);
1072
1073 BIND(MATCH);
1074 // Undo the index increment.
1075 sub(idx, idx, vec_len);
1076
1077 // Crop the vector to find its location.
1078 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1079 add(result, idx, -1);
1080 sve_incp(result, T, tmp_pdn);
1081 BIND(DONE);
1082 }
1083
// Find the first occurrence of the Latin-1 char `ch` in the byte string
// [str1, str1 + cnt1). Leaves the byte index in `result`, or -1 when the
// char is not found. Strings of 8 or more bytes are scanned one 64-bit word
// at a time using the SWAR "has-zero-byte" trick; shorter strings use a
// simple per-byte loop. Byte-lane analogue of string_indexof_char above.
// Clobbers: str1, cnt1, ch, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;  // re-used as the negative byte offset from the string end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // The empty string never matches.
  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate the byte into all eight byte lanes of `ch`.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at (end - 8 bytes) and walk it with a negative, increasing
  // offset so the loop-termination check is a flag-setting add.
  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  // SWAR constant: 0x01 in every byte lane.
  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
  ldr(ch1, Address(str1, cnt1_neg));
  // XOR zeroes the byte lanes that match `ch`; then
  // (x - 0x01) & ~(x | 0x7f) leaves 0x80 exactly in zero lanes.
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  // Ran past the end: unless the last iteration ended exactly on the final
  // word, re-scan the (possibly overlapping) last word at offset 0.
  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  // Locate the lowest matching byte: byte-reverse so a leading-zero count
  // gives the bit position, then convert bits to bytes (LSR 3).
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  // Fewer than 8 bytes: compare one byte at a time.
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
  ldrb(ch1, Address(str1, cnt1_neg));
  cmp(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 1);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  // Byte index = base count + signed offset.
  add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1147
// Compare strings.
//
// `ae` (LL/UU/LU/UL, see StrIntrinsicNode) selects Latin-1 vs UTF-16 for
// str1/str2 respectively. On entry cnt1/cnt2 hold lengths in BYTES; the
// returned `result` follows the usual compare contract in CHARACTERS:
// the difference of the first mismatching chars, or (len1 - len2) when one
// string is a prefix of the other. Long strings are handed to a runtime
// stub; medium strings are compared a 64-bit word at a time (Latin-1 data
// is widened to UTF-16 via zip1 against a zero vector); short strings use
// a software-pipelined per-character loop.
// vtmp3, pgtmp1 and pgtmp2 are unused in this path — presumably reserved
// for an SVE variant; confirm against the matcher rules in the .ad file.
// Clobbers: str1, str2, cnt1, cnt2, tmp1, tmp2, vtmp1, vtmp2, rscratch1,
// rscratch2, rflags.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // Per-string element shift/size in bytes (Latin-1: 1, UTF-16: 2).
  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Chars held by one 64-bit compare word.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Pick byte vs halfword loads and the matching zero-extend at codegen time.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical array references compare equal without looking at data.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Address both strings from their ends; iterate with negative offsets.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin-1 str1 is loaded 4 bytes at a time and widened to UTF-16 by
      // interleaving with a zero vector (zip1), so both sides compare as U.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror image of LU: str2 is the Latin-1 side that gets widened.
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    // Byte-reverse + clz finds the lowest differing byte; round the bit
    // index down to a char boundary, shift both words so that char is at
    // bit 0, zero-extend and subtract.
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    // Bail out of compilation: a trampoline could not be allocated.
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // Two chars in flight per iteration: (tmp1, cnt1) vs (tmp2, rscratch1).
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1383
1384 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1385 FloatRegister src2, Condition cond, bool isQ) {
1386 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1387 FloatRegister zn = src1, zm = src2;
1388 bool needs_negation = false;
1389 switch (cond) {
1390 case LT: cond = GT; zn = src2; zm = src1; break;
1391 case LE: cond = GE; zn = src2; zm = src1; break;
1392 case LO: cond = HI; zn = src2; zm = src1; break;
1393 case LS: cond = HS; zn = src2; zm = src1; break;
1394 case NE: cond = EQ; needs_negation = true; break;
1395 default:
1396 break;
1397 }
1398
1399 if (is_floating_point_type(bt)) {
1400 fcm(cond, dst, size, zn, zm);
1401 } else {
1402 cm(cond, dst, size, zn, zm);
1403 }
1404
1405 if (needs_negation) {
1406 notr(dst, isQ ? T16B : T8B, dst);
1407 }
1408 }
1409
1410 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1411 Condition cond, bool isQ) {
1412 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1413 if (bt == T_FLOAT || bt == T_DOUBLE) {
1414 if (cond == Assembler::NE) {
1415 fcm(Assembler::EQ, dst, size, src);
1416 notr(dst, isQ ? T16B : T8B, dst);
1417 } else {
1418 fcm(cond, dst, size, src);
1419 }
1420 } else {
1421 if (cond == Assembler::NE) {
1422 cm(Assembler::EQ, dst, size, src);
1423 notr(dst, isQ ? T16B : T8B, dst);
1424 } else {
1425 cm(cond, dst, size, src);
1426 }
1427 }
1428 }
1429
1430 // Compress the least significant bit of each byte to the rightmost and clear
1431 // the higher garbage bits.
1432 void C2_MacroAssembler::bytemask_compress(Register dst) {
1433 // Example input, dst = 0x01 00 00 00 01 01 00 01
1434 // The "??" bytes are garbage.
1435 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1436 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1437 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1438 andr(dst, dst, 0xff); // dst = 0x8D
1439 }
1440
1441 // Pack the value of each mask element in "src" into a long value in "dst", at most
1442 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1443 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1444 // one bit in "dst".
1445 //
1446 // Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1447 // Expected: dst = 0x658D
1448 //
1449 // Clobbers: rscratch1
1450 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1451 FloatRegister vtmp, int lane_cnt) {
1452 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1453 assert_different_registers(dst, rscratch1);
1454 assert_different_registers(src, vtmp);
1455 assert(UseSVE > 0, "must be");
1456
1457 // Compress the lowest 8 bytes.
1458 fmovd(dst, src);
1459 bytemask_compress(dst);
1460 if (lane_cnt <= 8) return;
1461
1462 // Repeat on higher bytes and join the results.
1463 // Compress 8 bytes in each iteration.
1464 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1465 sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1466 bytemask_compress(rscratch1);
1467 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1468 }
1469 }
1470
// The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
// instruction which requires the FEAT_BITPERM feature.
// Clobbers: vtmp1, vtmp2 (no general-purpose scratch needed).
void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(src, vtmp1, vtmp2);
  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");

  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress each significant bit of the byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the biggest lane size (T = D) then
  // concatenate the results.

  // The second source input of BEXT, initialized with 0x01 in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BEXT vtmp1.D, src.D, vtmp2.D
  // src   = 0x0001010000010001 | 0x0100000001010001
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  sve_bext(vtmp1, D, src, vtmp2);

  // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
  // result to dst.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  // dst   = 0x658D
  if (lane_cnt <= 8) {
    // No need to concatenate.
    umov(dst, vtmp1, B, 0);
  } else if (lane_cnt <= 16) {
    // Move byte 8 next to byte 0, then read out the combined halfword.
    ins(vtmp1, B, vtmp1, 1, 8);
    umov(dst, vtmp1, H, 0);
  } else {
    // As the lane count is 64 at most, the final expected value must be in
    // the lowest 64 bits after narrowing vtmp1 from D to B.
    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
    umov(dst, vtmp1, D, 0);
  }
}
1514
// Unpack the mask, a long value in "src", into a vector register of boolean
// represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
// most 64 lanes.
//
// Below example gives the expected dst vector register, with a valid src(0x658D)
// on a 128-bit vector size machine.
// dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
                                           FloatRegister vtmp, int lane_cnt) {
  assert_different_registers(dst, vtmp);
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");

  // Example: src = 0x658D, lane_cnt = 16
  // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01

  // Put long value from general purpose register into the first lane of vector.
  // vtmp = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp, B, 0);
  mov(vtmp, D, 0, src);

  // Transform the value in the first lane which is mask in bit now to the mask in
  // byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit each byte in every 8 bytes.
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing. As only one byte exists.
  } else if (lane_cnt <= 16) {
    // Move byte 1 up to byte 8 so each D-lane holds one byte of the mask.
    ins(vtmp, B, vtmp, 8, 1);
  } else {
    // Spread each mask byte into its own D lane.
    sve_vector_extend(vtmp, D, vtmp, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(dst, B, 1);

  // BDEP dst.D, vtmp.D, dst.D
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  // dst  = 0x0101010101010101 | 0x0101010101010101
  //        ---------------------------------------
  // dst  = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(dst, D, vtmp, dst);
}
1561
1562 // Clobbers: rflags
1563 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1564 FloatRegister zn, FloatRegister zm, Condition cond) {
1565 assert(pg->is_governing(), "This register has to be a governing predicate register");
1566 FloatRegister z1 = zn, z2 = zm;
1567 switch (cond) {
1568 case LE: z1 = zm; z2 = zn; cond = GE; break;
1569 case LT: z1 = zm; z2 = zn; cond = GT; break;
1570 case LO: z1 = zm; z2 = zn; cond = HI; break;
1571 case LS: z1 = zm; z2 = zn; cond = HS; break;
1572 default:
1573 break;
1574 }
1575
1576 SIMD_RegVariant size = elemType_to_regVariant(bt);
1577 if (is_floating_point_type(bt)) {
1578 sve_fcm(cond, pd, size, pg, z1, z2);
1579 } else {
1580 assert(is_integral_type(bt), "unsupported element type");
1581 sve_cmp(cond, pd, size, pg, z1, z2);
1582 }
1583 }
1584
// Get index of the last mask lane that is set
// Clobbers: rscratch1, ptmp
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the lane order so the last set lane becomes the first.
  sve_rev(ptmp, size, src);
  // BRKB activates every lane strictly before the first set lane.
  sve_brkb(ptmp, ptrue, ptmp, false);
  // Counting them gives the number of lanes after the last set lane in the
  // original ordering.
  sve_cntp(dst, size, ptrue, ptmp);
  // last index = (total lanes - 1) - count. Note: for an all-false mask this
  // yields -1 — presumably relied on by callers; confirm at the use sites.
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1594
1595 // Extend integer vector src to dst with the same lane count
1596 // but larger element size, e.g. 4B -> 4I
1597 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1598 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1599 if (src_bt == T_BYTE) {
1600 // 4B to 4S/4I, 8B to 8S
1601 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1602 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1603 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1604 if (dst_bt == T_INT) {
1605 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1606 }
1607 } else if (src_bt == T_SHORT) {
1608 // 2S to 2I/2L, 4S to 4I
1609 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1610 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1611 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1612 if (dst_bt == T_LONG) {
1613 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1614 }
1615 } else if (src_bt == T_INT) {
1616 // 2I to 2L
1617 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1618 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1619 } else {
1620 ShouldNotReachHere();
1621 }
1622 }
1623
1624 // Narrow integer vector src down to dst with the same lane count
1625 // but smaller element size, e.g. 4I -> 4B
1626 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1627 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1628 if (src_bt == T_SHORT) {
1629 // 4S/8S to 4B/8B
1630 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1631 assert(dst_bt == T_BYTE, "unsupported");
1632 xtn(dst, T8B, src, T8H);
1633 } else if (src_bt == T_INT) {
1634 // 2I to 2S, 4I to 4B/4S
1635 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1636 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1637 xtn(dst, T4H, src, T4S);
1638 if (dst_bt == T_BYTE) {
1639 xtn(dst, T8B, dst, T8H);
1640 }
1641 } else if (src_bt == T_LONG) {
1642 // 2L to 2S/2I
1643 assert(src_vlen_in_bytes == 16, "unsupported");
1644 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1645 xtn(dst, T2S, src, T2D);
1646 if (dst_bt == T_SHORT) {
1647 xtn(dst, T4H, dst, T4S);
1648 }
1649 } else {
1650 ShouldNotReachHere();
1651 }
1652 }
1653
1654 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1655 FloatRegister src, SIMD_RegVariant src_size,
1656 bool is_unsigned) {
1657 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1658
1659 if (src_size == B) {
1660 switch (dst_size) {
1661 case H:
1662 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1663 break;
1664 case S:
1665 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1666 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1667 break;
1668 case D:
1669 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1670 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1671 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1672 break;
1673 default:
1674 ShouldNotReachHere();
1675 }
1676 } else if (src_size == H) {
1677 if (dst_size == S) {
1678 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1679 } else { // D
1680 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1681 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1682 }
1683 } else if (src_size == S) {
1684 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1685 }
1686 }
1687
1688 // Vector narrow from src to dst with specified element sizes.
1689 // High part of dst vector will be filled with zero.
1690 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1691 FloatRegister src, SIMD_RegVariant src_size,
1692 FloatRegister tmp) {
1693 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1694 assert_different_registers(src, tmp);
1695 sve_dup(tmp, src_size, 0);
1696 if (src_size == D) {
1697 switch (dst_size) {
1698 case S:
1699 sve_uzp1(dst, S, src, tmp);
1700 break;
1701 case H:
1702 assert_different_registers(dst, tmp);
1703 sve_uzp1(dst, S, src, tmp);
1704 sve_uzp1(dst, H, dst, tmp);
1705 break;
1706 case B:
1707 assert_different_registers(dst, tmp);
1708 sve_uzp1(dst, S, src, tmp);
1709 sve_uzp1(dst, H, dst, tmp);
1710 sve_uzp1(dst, B, dst, tmp);
1711 break;
1712 default:
1713 ShouldNotReachHere();
1714 }
1715 } else if (src_size == S) {
1716 if (dst_size == H) {
1717 sve_uzp1(dst, H, src, tmp);
1718 } else { // B
1719 assert_different_registers(dst, tmp);
1720 sve_uzp1(dst, H, src, tmp);
1721 sve_uzp1(dst, B, dst, tmp);
1722 }
1723 } else if (src_size == H) {
1724 sve_uzp1(dst, B, src, tmp);
1725 }
1726 }
1727
1728 // Extend src predicate to dst predicate with the same lane count but larger
1729 // element size, e.g. 64Byte -> 512Long
1730 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1731 uint dst_element_length_in_bytes,
1732 uint src_element_length_in_bytes) {
1733 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1734 sve_punpklo(dst, src);
1735 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1736 sve_punpklo(dst, src);
1737 sve_punpklo(dst, dst);
1738 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1739 sve_punpklo(dst, src);
1740 sve_punpklo(dst, dst);
1741 sve_punpklo(dst, dst);
1742 } else {
1743 assert(false, "unsupported");
1744 ShouldNotReachHere();
1745 }
1746 }
1747
1748 // Narrow src predicate to dst predicate with the same lane count but
1749 // smaller element size, e.g. 512Long -> 64Byte
1750 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1751 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1752 // The insignificant bits in src predicate are expected to be zero.
1753 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1754 // passed as the second argument. An example narrowing operation with a given mask would be -
1755 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1756 // Mask (for 2 Longs) : TF
1757 // Predicate register for the above mask (16 bits) : 00000001 00000000
1758 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1759 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1760 assert_different_registers(src, ptmp);
1761 assert_different_registers(dst, ptmp);
1762 sve_pfalse(ptmp);
1763 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1764 sve_uzp1(dst, B, src, ptmp);
1765 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1766 sve_uzp1(dst, H, src, ptmp);
1767 sve_uzp1(dst, B, dst, ptmp);
1768 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1769 sve_uzp1(dst, S, src, ptmp);
1770 sve_uzp1(dst, H, dst, ptmp);
1771 sve_uzp1(dst, B, dst, ptmp);
1772 } else {
1773 assert(false, "unsupported");
1774 ShouldNotReachHere();
1775 }
1776 }
1777
// Vector reduction add for integral type with ASIMD instructions.
// Computes dst = isrc + sum(all lanes of vsrc), with the scalar add
// sign-extending sub-int lane values so the result matches the element
// type's arithmetic.
// Clobbers: vtmp
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      // Sum all byte lanes, sign-extend the lane sum, then fold in isrc
      // (itself sign-extended from byte).
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      // A 64-bit vector has only 2 int lanes, so a single pairwise add
      // replaces the across-lanes ADDV.
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      // Long reduction only exists for 128-bit vectors (2 lanes).
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}
1816
// Vector reduction multiply for integral type with ASIMD instructions.
// Computes dst = isrc * product(all lanes of vsrc). There is no across-lanes
// multiply instruction, so the vector is folded in half repeatedly (high half
// times low half) until two lanes remain, which are multiplied into the
// scalar. Sub-int results are re-sign-extended after each scalar multiply to
// keep the value in the element type's range.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1]
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      // Same halving scheme on halfword lanes; sign-extend after each
      // scalar multiply.
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        // Fold 4 int lanes down to 2, then finish in scalar registers.
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        // Only 2 lanes to begin with; read them straight from vsrc.
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      // Two lanes only: multiply both into the scalar, no vector temp needed.
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1897
// Vector reduction multiply for floating-point type with ASIMD instructions.
// Multiplies the incoming scalar fsrc with every lane of vsrc strictly in
// lane order (lane 0 first), which matches the sequential semantics required
// for a floating-point reduction. Each "ext" rotates vsrc by the byte offset
// of the next lane so that it lands in element 0 of vtmp, ready for the
// scalar multiply.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
  switch(bt) {
    // The T_SHORT type below is for Float16 type which also uses floating-point
    // instructions.
    case T_SHORT:
      // Lanes 0-3 (byte offsets 0, 2, 4, 6).
      fmulh(dst, fsrc, vsrc);
      ext(vtmp, T8B, vsrc, vsrc, 2);
      fmulh(dst, dst, vtmp);
      ext(vtmp, T8B, vsrc, vsrc, 4);
      fmulh(dst, dst, vtmp);
      ext(vtmp, T8B, vsrc, vsrc, 6);
      fmulh(dst, dst, vtmp);
      if (isQ) {
        // Lanes 4-7 only exist for 128-bit vectors.
        ext(vtmp, T16B, vsrc, vsrc, 8);
        fmulh(dst, dst, vtmp);
        ext(vtmp, T16B, vsrc, vsrc, 10);
        fmulh(dst, dst, vtmp);
        ext(vtmp, T16B, vsrc, vsrc, 12);
        fmulh(dst, dst, vtmp);
        ext(vtmp, T16B, vsrc, vsrc, 14);
        fmulh(dst, dst, vtmp);
      }
      break;
    case T_FLOAT:
      fmuls(dst, fsrc, vsrc);
      // "ins" moves lane n of vsrc into lane 0 of vtmp.
      ins(vtmp, S, vsrc, 0, 1);
      fmuls(dst, dst, vtmp);
      if (isQ) {
        ins(vtmp, S, vsrc, 0, 2);
        fmuls(dst, dst, vtmp);
        ins(vtmp, S, vsrc, 0, 3);
        fmuls(dst, dst, vtmp);
      }
      break;
    case T_DOUBLE:
      // Doubles only come in 2-lane (128-bit) form here.
      assert(isQ, "unsupported");
      fmuld(dst, fsrc, vsrc);
      ins(vtmp, D, vsrc, 0, 1);
      fmuld(dst, dst, vtmp);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}
1952
// Vector reduction add for half float type with ASIMD instructions.
// Adds fsrc and every half-float lane of vsrc strictly in lane order,
// matching the sequential semantics required for a floating-point reduction.
// Each "ext" rotates vsrc by 2*n bytes so that lane n lands in element 0 of
// vtmp, ready for the scalar faddh.
void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
                                             unsigned vector_length_in_bytes, FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_fp16 {");
  // Lanes 0-3 (byte offsets 0, 2, 4, 6).
  faddh(dst, fsrc, vsrc);
  ext(vtmp, T8B, vsrc, vsrc, 2);
  faddh(dst, dst, vtmp);
  ext(vtmp, T8B, vsrc, vsrc, 4);
  faddh(dst, dst, vtmp);
  ext(vtmp, T8B, vsrc, vsrc, 6);
  faddh(dst, dst, vtmp);
  if (isQ) {
    // Lanes 4-7 only exist for 128-bit vectors.
    ext(vtmp, T16B, vsrc, vsrc, 8);
    faddh(dst, dst, vtmp);
    ext(vtmp, T16B, vsrc, vsrc, 10);
    faddh(dst, dst, vtmp);
    ext(vtmp, T16B, vsrc, vsrc, 12);
    faddh(dst, dst, vtmp);
    ext(vtmp, T16B, vsrc, vsrc, 14);
    faddh(dst, dst, vtmp);
  }
  BLOCK_COMMENT("} neon_reduce_add_fp16");
}
1979
1980 // Helper to select logical instruction
1981 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1982 Register Rn, Register Rm,
1983 enum shift_kind kind, unsigned shift) {
1984 switch(opc) {
1985 case Op_AndReductionV:
1986 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1987 break;
1988 case Op_OrReductionV:
1989 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1990 break;
1991 case Op_XorReductionV:
1992 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1993 break;
1994 default:
1995 assert(false, "unsupported");
1996 ShouldNotReachHere();
1997 }
1998 }
1999
// Vector reduction logical operations And, Or, Xor
// Combines every lane of vsrc and the scalar isrc with the logical operation
// selected by opc. The two halves of the vector are first moved to general
// registers and combined there; the remaining folds are done with
// shifted-register logical instructions, halving the valid width each step.
// Sub-int results are sign-extended at the end.
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  // Move both halves of the vector into general registers and combine them.
  // After this, dst holds 64 (isQ) or 32 valid bits of partially-folded data.
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  switch(bt) {
    case T_BYTE:
      // Fold 64 -> 32 -> 16 -> 8 valid bits, then fold in the scalar input.
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      // Fold 64 -> 32 -> 16 valid bits, then fold in the scalar input.
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      // Fold 64 -> 32 valid bits (Q form only), then fold in the scalar input.
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      // The initial 64-bit combine already folded both lanes.
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}
2049
2050 // Helper function to decode min/max reduction operation properties
2051 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2052 bool* is_unsigned,
2053 Condition* cond) {
2054 switch(opc) {
2055 case Op_MinReductionV:
2056 *is_min = true; *is_unsigned = false; *cond = LT; break;
2057 case Op_MaxReductionV:
2058 *is_min = false; *is_unsigned = false; *cond = GT; break;
2059 case Op_UMinReductionV:
2060 *is_min = true; *is_unsigned = true; *cond = LO; break;
2061 case Op_UMaxReductionV:
2062 *is_min = false; *is_unsigned = true; *cond = HI; break;
2063 default:
2064 ShouldNotReachHere();
2065 }
2066 }
2067
// Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
// Reduces all lanes of vsrc and the scalar isrc to a single min/max value in
// dst. For sub-long types, a vector reduction (or pairwise op) produces the
// vector-wide extreme first, and the scalar input is folded in with a
// compare-and-select at the end.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
         opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min;
  bool is_unsigned;
  Condition cond;
  decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    // Long reduction is done entirely with scalar compares: move each of the
    // two lanes to a general register and compare-and-select in turn.
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, cond);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, cond);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      // For T2S (2x32-bit elements), use pairwise instructions because
      // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
      neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
    } else {
      // For other sizes, use reduction to scalar instructions.
      neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
    }
    // Move the vector-wide extreme to dst, sign- or zero-extending according
    // to the element type and signedness of the operation.
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else if (is_unsigned) {
      umov(dst, vtmp, elemType_to_regVariant(bt), 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    // Fold in the scalar input.
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, cond);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
2117
// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
// The vector src2 is reduced (under predicate pg) with the matching SVE
// reduction instruction, the scalar result is moved to dst, and the incoming
// scalar src1 is folded in with the corresponding scalar operation.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      // Sub-int elements are sign-extended as part of the add; int elements
      // can be moved and added directly.
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      // smov sign-extends sub-int results; int/long are moved as-is.
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV:
    case Op_MinReductionV:
    case Op_UMaxReductionV:
    case Op_UMinReductionV: {
      bool is_min;
      bool is_unsigned;
      Condition cond;
      decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
      sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
      // Move result from vector to general register
      if (is_unsigned || bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      // Fold in the scalar input with a compare-and-select.
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, cond);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, cond);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // For And/Or/Xor, the 32-bit combine with src1 above may set bits beyond
  // the element width; re-sign-extend sub-int results to keep them canonical.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
2228
// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
//
// Uses the cheapest available encoding: "ptrue" with a fixed VL pattern when
// lane_cnt matches one, a special pattern (POW2/MUL4/MUL3) when it matches the
// hardware vector length, and a "whileltw" on a scratch register otherwise.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  // The VL1..VL8 patterns encode as their own value, so lane_cnt can be
  // passed through directly for counts 1-8.
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      // Counts with no fixed VL pattern (e.g. 9-15, 17-31, ...) fall through
      // to the special patterns or "whileltw" below.
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}
2295
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: mask, vzr
// Note: vzr is expected to hold all-zero (it supplies the zeroed half in the
// uzp1 narrowing steps); pgtmp must be a governing predicate.
// SVE "compact" only supports S and D element sizes, so each half of the
// SHORT vector is widened to INT, compacted, narrowed back, and the two
// compressed halves are joined with "splice".
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // high <-- low
  // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst = 00 00 00 hh ee dd bb aa

  // Extend lowest half to type INT.
  // dst = 00dd 00cc 00bb 00aa
  sve_uunpklo(dst, S, src);
  // pgtmp = 0001 0000 0001 0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst = 0000 00dd 00bb 00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 0001 0000 0000 0001
  sve_punpkhi(pgtmp, mask);
  // vtmp = 00hh 00gg 00ff 00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp = 0000 0000 00hh 00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // Build a predicate covering the first rscratch1 (low-half active count)
  // lanes, so "splice" inserts the compressed high half right after them.
  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:   dst   = 00 00 00 00 00 dd bb aa
  // Compressed high:  vtmp  = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  // dst = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2354
// Pack active elements of src, under the control of mask, into the
// lowest-numbered elements of dst, zero-filling the rest. Works by widening
// each half of the BYTE vector to SHORT, delegating to sve_compress_short,
// narrowing back, and splicing the two compressed halves together.
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  // vtmp3 serves as the all-zero vector required by the uzp1 narrowing steps.
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 00 01 00 00 00 01 00 01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst = 00 00 00 00 00 0g 0c 0a
  // The SHORT-typed intermediate occupies twice the bytes of the input;
  // the low half's widened length is capped at MaxVectorSize.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp = 00 01 00 00 00 00 00 01
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 00 00 00 00 00 00 0p 0i
  // The high half's widened length is whatever exceeds MaxVectorSize.
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // Build a predicate covering the first rscratch2 (low-half active count)
  // lanes, so "splice" inserts the compressed high half right after them.
  // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:   dst   = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high:  vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2411
2412 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2413 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2414 SIMD_Arrangement size = isQ ? T16B : T8B;
2415 if (bt == T_BYTE) {
2416 rbit(dst, size, src);
2417 } else {
2418 neon_reverse_bytes(dst, src, bt, isQ);
2419 rbit(dst, size, dst);
2420 }
2421 }
2422
2423 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2424 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2425 SIMD_Arrangement size = isQ ? T16B : T8B;
2426 switch (bt) {
2427 case T_BYTE:
2428 if (dst != src) {
2429 orr(dst, size, src, src);
2430 }
2431 break;
2432 case T_SHORT:
2433 rev16(dst, size, src);
2434 break;
2435 case T_INT:
2436 rev32(dst, size, src);
2437 break;
2438 case T_LONG:
2439 rev64(dst, size, src);
2440 break;
2441 default:
2442 assert(false, "unsupported");
2443 ShouldNotReachHere();
2444 }
2445 }
2446
// VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
// But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
// For VectorRearrange long/double, we compare the shuffle input with iota indices,
// and use bsl to implement the operation.
void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
                                           FloatRegister shuffle, FloatRegister tmp,
                                           BasicType bt, bool isQ) {
  assert_different_registers(dst, src, shuffle, tmp);
  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Here is an example that rearranges a NEON vector with 4 ints:
  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
  //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
  //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
  //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
  //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
  //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
  //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
  //   4. Use Vm as index register, and use V1 as table register.
  //      Then get V2 as the result by tbl NEON instructions.
  switch (bt) {
    case T_SHORT:
      // Scale each 2-byte element index by 2 to get its byte base, then add
      // the per-element byte offsets {0, 1} (constant 0x0100 per short lane).
      mov(tmp, size1, 0x02);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x0100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_INT:
    case T_FLOAT:
      // Scale each 4-byte element index by 4, then add the per-element byte
      // offsets {0, 1, 2, 3} (constant 0x03020100 per int lane).
      mov(tmp, size1, 0x04);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x03020100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_LONG:
    case T_DOUBLE:
      // Only two lanes exist: the shuffle is either identity or a swap.
      {
        int idx = vector_iota_entry_index(T_LONG);
        lea(rscratch1,
            ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
        ldrq(tmp, rscratch1);
        // Check whether the input "shuffle" is the same with iota indices.
        // Return "src" if true, otherwise swap the two elements of "src".
        cm(EQ, dst, size2, shuffle, tmp);
        ext(tmp, size1, src, src, 8);
        bsl(dst, size1, src, tmp);
      }
      break;
    default:
      assert(false, "unsupported element type");
      ShouldNotReachHere();
  }
}
2504
// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
// If the element lies within the low 128 bits, a plain NEON lane move
// suffices; otherwise the vector is copied and shifted down with sve_ext so
// the wanted element lands in lane 0.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    // umov zero-extends; smov sign-extends sub-int elements.
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    // Copy src (which must be preserved), then shift the vector down by the
    // byte offset of the element ("idx << size" — size encodes log2 of the
    // element size in bytes) and read lane 0.
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}
2527
// java.lang.Math::round intrinsics

// Vectorized Math.round for float/double lanes.
// Two candidate results are computed per lane and then blended:
//   - tmp1 = floor(src + 0.5) via fcvtms, the Java rounding semantics for
//     values where adding 0.5 is exact;
//   - dst  = fcvtas(src), round-ties-to-away, used for the remaining lanes.
// The blend mask is built by comparing the bit pattern of -src against that
// of 2^23 (float) / 2^52 (double) — NOTE(review): the cm(HS, ...) here is an
// integer compare of FP bit patterns; confirm against the matching rules
// before altering this sequence.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      // 2^23: smallest float magnitude with no fractional bits.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      // 2^52: smallest double magnitude with no fractional bits.
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // bif inserts tmp1 bits into dst where the tmp3 mask bits are clear.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2562
// Vectorized Math.round for float/double lanes with SVE.
// dst is first set to round-ties-to-away (frinta); then, for the lanes
// selected by the bit-pattern compare (see the NEON variant above for the
// same trick), it is overwritten with floor(src + 0.5) under predicate
// pgtmp. The predicated block is skipped entirely when no lane needs it.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      // 2^23: smallest float magnitude with no fractional bits.
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      // 2^52: smallest double magnitude with no fractional bits.
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // Skip the predicated fix-up when the compare selected no lanes.
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  // Convert the rounded FP values to integers across all lanes.
  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2600
// Vectorized Math.signum: produces +/-1.0 with the sign of src for nonzero
// finite lanes, and src itself (+-0.0 or NaN) otherwise. "zero" must hold
// 0.0 and "one" must hold 1.0 in every lane.
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  // bsl: where mask bits are set take the magnitude bits from "one"; the
  // sign bit (cleared in the mask by the ushr) comes from src.
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
2610
// Vectorized Math.signum with SVE: produces +/-1.0 with the sign of src for
// nonzero finite lanes, and src itself (+-0.0 or NaN) otherwise. "zero" must
// hold 0.0 in every lane.
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      // min_jint == 0x80000000 is the float sign-bit mask.
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      // min_jlong is the double sign-bit mask.
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}
2635
2636 bool C2_MacroAssembler::in_scratch_emit_size() {
2637 if (ciEnv::current()->task() != nullptr) {
2638 PhaseOutput* phase_output = Compile::current()->output();
2639 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2640 return true;
2641 }
2642 }
2643 return MacroAssembler::in_scratch_emit_size();
2644 }
2645
// Runtime target called from generated code (see verify_int_in_range) when a
// CastII node's value falls outside its declared type range; never returns.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2649
// Emit a debug check that rval lies within the int range declared by type t
// ([t->_lo, t->_hi]). On violation, the runtime helper above is called with
// the node index and bounds, then execution halts. No code is emitted for
// the unconstrained TypeInt::INT.
// Clobbers: rtmp, rflags, c_rarg0-c_rarg3 (failure path only)
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  // Each bound check is a flag-setting subtract; a bound equal to the type's
  // extreme can never fail and is skipped.
  if (lo != min_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jint) {
    subsw(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, val, lo, hi) into the C calling convention and call the
  // fatal() helper; hlt(0) is unreachable.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}
2684
// Runtime target called from generated code (see verify_long_in_range) when a
// CastLL node's value falls outside its declared type range; never returns.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2688
// Emit a debug check that rval lies within the long range declared by type t
// ([t->_lo, t->_hi]). Long counterpart of verify_int_in_range above. On
// violation, the runtime helper is called and execution halts. No code is
// emitted for the unconstrained TypeLong::LONG.
// Clobbers: rtmp, rflags, c_rarg0-c_rarg3 (failure path only)
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  // Each bound check is a flag-setting subtract; a bound equal to the type's
  // extreme can never fail and is skipped.
  if (lo != min_jlong) {
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jlong) {
    subs(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Marshal (idx, val, lo, hi) into the C calling convention and call the
  // fatal() helper; hlt(0) is unreachable.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  mov(c_rarg2, lo);
  mov(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_long_in_range");
}
2723
// Make rfp hold a valid frame pointer (sp + framesize - 2*wordSize) before a
// runtime call. With PreserveFramePointer, rfp is already maintained and is
// only verified in debug builds; otherwise it is recomputed from sp.
// Clobbers: rtmp (debug builds only)
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rfp.
    add(rtmp, sp, framesize - 2 * wordSize);
    Label L_success;
    cmp(rfp, rtmp);
    br(Assembler::EQ, L_success);
    stop("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    add(rfp, sp, framesize - 2 * wordSize);
  }
}
2741
// Selects elements from two source vectors (src1, src2) based on index values in the index register
// using Neon instructions and places it in the destination vector element corresponding to the
// index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
// where NUM_ELEM is the number of BasicType elements per vector.
// If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
// Otherwise, selects src2[idx - NUM_ELEM]
void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
                                                     FloatRegister src2, FloatRegister index,
                                                     FloatRegister tmp, unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, tmp);
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;

  if (vector_length_in_bytes == 16) {
    // The two-register tbl form requires consecutively numbered registers.
    assert(UseSVE <= 1, "sve must be <= 1");
    assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
    tbl(dst, size, src1, 2, index);
  } else { // vector length == 8
    assert(UseSVE == 0, "must be Neon only");
    // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
    // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
    // instruction with one vector lookup
    // tmp = src2[0:7]:src1[0:7]
    ins(tmp, D, src1, 0, 0);
    ins(tmp, D, src2, 1, 0);
    tbl(dst, size, tmp, 1, index);
  }
}
2769
2770 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2771 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2772 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2773 // where NUM_ELEM is the number of BasicType elements per vector.
2774 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2775 // Otherwise, selects src2[idx – NUM_ELEM]
2776 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2777 FloatRegister src2, FloatRegister index,
2778 FloatRegister tmp, SIMD_RegVariant T,
2779 unsigned vector_length_in_bytes) {
2780 assert_different_registers(dst, src1, src2, index, tmp);
2781
2782 if (vector_length_in_bytes == 8) {
2783 // We need to fit both the source vectors (src1, src2) in a single vector register because the
2784 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2785 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2786 // instruction with one vector lookup
2787 assert(UseSVE >= 1, "sve must be >= 1");
2788 ins(tmp, D, src1, 0, 0);
2789 ins(tmp, D, src2, 1, 0);
2790 sve_tbl(dst, T, tmp, index);
2791 } else { // UseSVE == 2 and vector_length_in_bytes > 8
2792 // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2793 // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2794 // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2795 // with the only exception of 8B vector length.
2796 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2797 assert(src1->successor() == src2, "Source registers must be ordered");
2798 sve_tbl(dst, T, src1, src2, index);
2799 }
2800 }
2801
2802 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2803 FloatRegister src2, FloatRegister index,
2804 FloatRegister tmp, BasicType bt,
2805 unsigned vector_length_in_bytes) {
2806
2807 assert_different_registers(dst, src1, src2, index, tmp);
2808
2809 // The cases that can reach this method are -
2810 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2811 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2812 //
2813 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2814 // and UseSVE = 2 with vector_length_in_bytes >= 8
2815 //
2816 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2817 // UseSVE = 1 with vector_length_in_bytes = 16
2818
2819 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2820 SIMD_RegVariant T = elemType_to_regVariant(bt);
2821 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2822 return;
2823 }
2824
2825 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2826 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2827 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2828
2829 bool isQ = vector_length_in_bytes == 16;
2830
2831 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2832 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2833
2834 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2835 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2836 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2837 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2838 // the indices can range from [0, 8).
2839 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2840 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2841 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2842 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2843 // Add the multiplied result to the vector in tmp to obtain the byte level
2844 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2845 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2846
2847 if (bt == T_BYTE) {
2848 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2849 } else {
2850 int elem_size = (bt == T_SHORT) ? 2 : 4;
2851 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2852
2853 mov(tmp, size1, elem_size);
2854 mulv(dst, size2, index, tmp);
2855 mov(tmp, size2, tbl_offset);
2856 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2857 // to select a set of 2B/4B
2858 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2859 }
2860 }
2861
// Vector expand implementation. Elements from the src vector are expanded into
// the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
// (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
// for NEON and SVE, but with different instructions where appropriate.
2869
2870 // Vector expand implementation for NEON.
2871 //
2872 // An example of 128-bit Byte vector:
2873 // Data direction: high <== low
2874 // Input:
2875 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2876 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2877 // Expected result:
2878 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2879 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2880 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2881 int vector_length_in_bytes) {
2882 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2883 assert_different_registers(dst, src, mask, tmp1, tmp2);
2884 // Since the TBL instruction only supports byte table, we need to
2885 // compute indices in byte type for all types.
2886 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2887 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2888 dup(tmp1, size, zr);
2889 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
2890 negr(dst, size, mask);
2891 // Calculate vector index for TBL with prefix sum algorithm.
2892 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
2893 for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2894 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2895 addv(dst, size, tmp2, dst);
2896 }
2897 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2898 orr(tmp2, size, mask, mask);
2899 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2900 bsl(tmp2, size, dst, tmp1);
2901 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2902 movi(tmp1, size, 1);
2903 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
2904 subv(dst, size, tmp2, tmp1);
2905 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2906 tbl(dst, size, src, 1, dst);
2907 }
2908
2909 // Vector expand implementation for SVE.
2910 //
2911 // An example of 128-bit Short vector:
2912 // Data direction: high <== low
2913 // Input:
2914 // src = gf ed cb a9 87 65 43 21
2915 // pg = 00 01 00 01 00 01 00 01
2916 // Expected result:
2917 // dst = 00 87 00 65 00 43 00 21
2918 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2919 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2920 int vector_length_in_bytes) {
2921 assert(UseSVE > 0, "expand implementation only for SVE");
2922 assert_different_registers(dst, src, tmp1, tmp2);
2923 SIMD_RegVariant size = elemType_to_regVariant(bt);
2924
2925 // tmp1 = 00 00 00 00 00 00 00 00
2926 sve_dup(tmp1, size, 0);
2927 sve_movprfx(tmp2, tmp1);
2928 // tmp2 = 00 01 00 01 00 01 00 01
2929 sve_cpy(tmp2, size, pg, 1, true);
2930 // Calculate vector index for TBL with prefix sum algorithm.
2931 // tmp2 = 04 04 03 03 02 02 01 01
2932 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2933 sve_movprfx(dst, tmp1);
2934 // The EXT instruction operates on the full-width sve register. The correct
2935 // index calculation method is:
2936 // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2937 // MaxVectorSize - i.
2938 sve_ext(dst, tmp2, MaxVectorSize - i);
2939 sve_add(tmp2, size, dst, tmp2);
2940 }
2941 // dst = 00 04 00 03 00 02 00 01
2942 sve_sel(dst, size, pg, tmp2, tmp1);
2943 // dst = -1 03 -1 02 -1 01 -1 00
2944 sve_sub(dst, size, 1);
2945 // dst = 00 87 00 65 00 43 00 21
2946 sve_tbl(dst, size, src, dst);
2947 }
2948
2949 // Optimized SVE cpy (imm, zeroing) instruction.
2950 //
2951 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2952 // functionality, but test results show that `movi; cpy(imm, merging)` has
2953 // higher throughput on some microarchitectures. This would depend on
2954 // microarchitecture and so may vary between implementations.
2955 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2956 PRegister pg, int imm8, bool isMerge) {
2957 if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2958 // Generates a NEON instruction `movi V<dst>.2d, #0`.
2959 // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2960 // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2961 // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2962 // entire Z<dst> register. According to the Arm Software Optimization
2963 // Guide, `movi` is zero latency.
2964 movi(dst, T2D, 0);
2965 isMerge = true;
2966 }
2967 Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2968 }
2969
2970 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
2971 // The vector iota entries array is ordered by type B/S/I/L/F/D, and
2972 // the offset between two types is 16.
2973 switch(bt) {
2974 case T_BYTE:
2975 return 0;
2976 case T_SHORT:
2977 return 1;
2978 case T_INT:
2979 return 2;
2980 case T_LONG:
2981 return 3;
2982 case T_FLOAT:
2983 return 4;
2984 case T_DOUBLE:
2985 return 5;
2986 default:
2987 ShouldNotReachHere();
2988 }
2989 }