1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "opto/c2_MacroAssembler.hpp"
28 #include "opto/compile.hpp"
29 #include "opto/intrinsicnode.hpp"
30 #include "opto/matcher.hpp"
31 #include "opto/output.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/objectMonitorTable.hpp"
34 #include "runtime/stubRoutines.hpp"
35 #include "runtime/synchronizer.hpp"
36 #include "utilities/globalDefinitions.hpp"
37 #include "utilities/powerOfTwo.hpp"
38
// Debug-build helpers: in PRODUCT builds block comments are compiled away;
// in debug builds they are emitted into the code stream so the disassembly
// is self-describing. STOP additionally records the message before trapping.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Bind a label and emit its name as a block comment in debug builds.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Pointer-to-member type for single-character loads (ldrb vs ldrh), chosen
// at emission time according to the string encoding (Latin1 vs UTF-16).
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
50
51 void C2_MacroAssembler::entry_barrier() {
52 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
53 // Dummy labels for just measuring the code size
54 Label dummy_slow_path;
55 Label dummy_continuation;
56 Label dummy_guard;
57 Label* slow_path = &dummy_slow_path;
58 Label* continuation = &dummy_continuation;
59 Label* guard = &dummy_guard;
60 if (!Compile::current()->output()->in_scratch_emit_size()) {
61 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
62 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
63 Compile::current()->output()->add_stub(stub);
64 slow_path = &stub->entry();
65 continuation = &stub->continuation();
66 guard = &stub->guard();
67 }
68 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
69 bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
70 }
71
72 // jdk.internal.util.ArraysSupport.vectorizedHashCode
// Emits code computing the Arrays.hashCode-style polynomial hash of the cnt
// elements at ary, accumulating into result (result = result * 31 + elem for
// each element). Short arrays run a scalar unrolled loop; long arrays call
// the vectorized stub. Returns pc() on success, or nullptr if the trampoline
// call to the stub could not be emitted (code cache full).
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
                                           FloatRegister vdata0, FloatRegister vdata1,
                                           FloatRegister vdata2, FloatRegister vdata3,
                                           FloatRegister vmul0, FloatRegister vmul1,
                                           FloatRegister vmul2, FloatRegister vmul3,
                                           FloatRegister vpow, FloatRegister vpowm,
                                           BasicType eltype) {
  ARRAYS_HASHCODE_REGISTERS;

  Register tmp1 = rscratch1, tmp2 = rscratch2;

  Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;

  // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
                    : eltype == T_CHAR || eltype == T_SHORT ? 8
                    : eltype == T_INT ? 4
                    : 0;
  guarantee(vf, "unsupported eltype");

  // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  const size_t unroll_factor = 4;

  switch (eltype) {
  case T_BOOLEAN:
    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
    break;
  case T_CHAR:
    BLOCK_COMMENT("arrays_hashcode(char) {");
    break;
  case T_BYTE:
    BLOCK_COMMENT("arrays_hashcode(byte) {");
    break;
  case T_SHORT:
    BLOCK_COMMENT("arrays_hashcode(short) {");
    break;
  case T_INT:
    BLOCK_COMMENT("arrays_hashcode(int) {");
    break;
  default:
    ShouldNotReachHere();
  }

  // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
  // implemented by the stub executes just once. Call the stub only if at least two iterations will
  // be executed.
  const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
  cmpw(cnt, large_threshold);
  br(Assembler::HS, LARGE);

  // Scalar path: process the remaining (cnt < threshold, or stub leftover)
  // elements one at a time in an unrolled loop entered via a computed branch.
  bind(TAIL);

  // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
  // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
  // Iteration eats up the remainder, uf elements at a time.
  assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
  andr(tmp2, cnt, unroll_factor - 1);
  adr(tmp1, BR_BASE);
  // For Cortex-A53 offset is 4 because 2 nops are generated.
  sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
  movw(tmp2, 0x1f); // 31, the hash multiplier used by the maddw below
  br(tmp1);

  bind(LOOP);
  for (size_t i = 0; i < unroll_factor; ++i) {
    load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
    // result = result * 31 + element
    maddw(result, result, tmp2, tmp1);
    // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
    // Generate 2nd nop to have 4 instructions per iteration.
    if (VM_Version::supports_a53mac()) {
      nop();
    }
  }
  bind(BR_BASE);
  subsw(cnt, cnt, unroll_factor);
  br(Assembler::HS, LOOP);

  b(DONE);

  // Large path: delegate to the vectorized stub via a trampoline call.
  bind(LARGE);

  RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
  assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
  address tpc = trampoline_call(stub);
  if (tpc == nullptr) {
    // Code cache full: abandon, leaving labels unbound only in debug mode.
    DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
    postcond(pc() == badAddress);
    return nullptr;
  }

  bind(DONE);

  BLOCK_COMMENT("} // arrays_hashcode");

  postcond(pc() != badAddress);
  return pc();
}
172
// Emits the C2 fast path for monitorenter (lightweight locking).
// The outcome is reported through the condition flags, which C2 consumes to
// pick the continuation: EQ => locked successfully, NE => take the runtime
// slow path (the ASSERT blocks at the bottom verify this contract).
// Clobbers t1, t2, t3 and rscratch2.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
                                  Register t2, Register t3) {
  assert_different_registers(obj, box, t1, t2, t3, rscratch2);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Synchronizing on a value-based class is diagnosed in the runtime.
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    // Compare obj with the entry just below the top of the lock-stack.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      // Mark word holds the (tagged) monitor pointer directly.
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      const Register t1_hash = t1;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mov(t3, t1_mark);

      // Look for the monitor in the om_cache.

      // Unrolled scan over the per-thread cache of (oop, monitor) pairs.
      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
        ldr(t2, Address(rthread, cache_offset));
        cmp(obj, t2);
        br(Assembler::EQ, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);

      // Get the table and calculate the bucket's address
      lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
      ldr(t3, Address(t3));
      ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
      // bucket index = hash & capacity_mask (capacity assumed power of two).
      ands(t1_hash, t1_hash, t2);
      ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
      br(Assembler::LO, slow_path);

      // Check if object matches.
      // NOTE(review): assumes try_resolve_weak_handle_in_c2 branches to
      // slow_path when the weak handle cannot be resolved — confirm against
      // the BarrierSetAssembler implementation.
      ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, t3, t2, slow_path);
      cmp(t3, obj);
      br(Assembler::NE, slow_path);

      bind(monitor_found);
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    // Without the table the monitor pointer in t1 still carries the mark-word
    // tag, which is folded into the field offsets below.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rscratch2);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Cache the monitor in the BasicLock so fast_unlock can find it.
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}
343
344 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
345 Register t2, Register t3) {
346 assert_different_registers(obj, box, t1, t2, t3);
347
348 // Handle inflated monitor.
349 Label inflated, inflated_load_mark;
350 // Finish fast unlock successfully. MUST branch to with flag == EQ
351 Label unlocked;
352 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
353 Label slow_path;
354
355 const Register t1_mark = t1;
356 const Register t2_top = t2;
357 const Register t3_t = t3;
358
359 { // Fast unlock
360
361 Label push_and_slow_path;
362
363 // Check if obj is top of lock-stack.
364 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
365 subw(t2_top, t2_top, oopSize);
366 ldr(t3_t, Address(rthread, t2_top));
367 cmp(obj, t3_t);
368 // Top of lock stack was not obj. Must be monitor.
369 br(Assembler::NE, inflated_load_mark);
370
371 // Pop lock-stack.
372 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
373 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
374
375 // Check if recursive.
376 subw(t3_t, t2_top, oopSize);
377 ldr(t3_t, Address(rthread, t3_t));
378 cmp(obj, t3_t);
379 br(Assembler::EQ, unlocked);
380
381 // Not recursive.
382 // Load Mark.
383 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
384
385 // Check header for monitor (0b10).
386 // Because we got here by popping (meaning we pushed in locked)
387 // there will be no monitor in the box. So we need to push back the obj
388 // so that the runtime can fix any potential anonymous owner.
389 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
390
391 // Try to unlock. Transition lock bits 0b00 => 0b01
392 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
393 orr(t3_t, t1_mark, markWord::unlocked_value);
394 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
395 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
396 br(Assembler::EQ, unlocked);
397
398 bind(push_and_slow_path);
399 // Compare and exchange failed.
400 // Restore lock-stack and handle the unlock in runtime.
401 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
402 addw(t2_top, t2_top, oopSize);
403 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
404 b(slow_path);
405 }
406
407
408 { // Handle inflated monitor.
409 bind(inflated_load_mark);
410 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
411 #ifdef ASSERT
412 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
413 stop("Fast Unlock not monitor");
414 #endif
415
416 bind(inflated);
417
418 #ifdef ASSERT
419 Label check_done;
420 subw(t2_top, t2_top, oopSize);
421 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
422 br(Assembler::LT, check_done);
423 ldr(t3_t, Address(rthread, t2_top));
424 cmp(obj, t3_t);
425 br(Assembler::NE, inflated);
426 stop("Fast Unlock lock on stack");
427 bind(check_done);
428 #endif
429
430 const Register t1_monitor = t1;
431
432 if (!UseObjectMonitorTable) {
433 assert(t1_monitor == t1_mark, "should be the same here");
434
435 // Untag the monitor.
436 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
437 } else {
438 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
439 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
440 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
441 br(Assembler::LO, slow_path);
442 }
443
444 const Register t2_recursions = t2;
445 Label not_recursive;
446
447 // Check if recursive.
448 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
449 cbz(t2_recursions, not_recursive);
450
451 // Recursive unlock.
452 sub(t2_recursions, t2_recursions, 1u);
453 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
454 // Set flag == EQ
455 cmp(t2_recursions, t2_recursions);
456 b(unlocked);
457
458 bind(not_recursive);
459
460 const Register t2_owner_addr = t2;
461
462 // Compute owner address.
463 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
464
465 // Set owner to null.
466 // Release to satisfy the JMM
467 stlr(zr, t2_owner_addr);
468 // We need a full fence after clearing owner to avoid stranding.
469 // StoreLoad achieves this.
470 membar(StoreLoad);
471
472 // Check if the entry_list is empty.
473 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
474 cmp(rscratch1, zr);
475 br(Assembler::EQ, unlocked); // If so we are done.
476
477 // Check if there is a successor.
478 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
479 cmp(rscratch1, zr);
480 br(Assembler::NE, unlocked); // If so we are done.
481
482 // Save the monitor pointer in the current thread, so we can try to
483 // reacquire the lock in SharedRuntime::monitor_exit_helper().
484 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
485
486 cmp(zr, rthread); // Set Flag to NE => slow path
487 b(slow_path);
488 }
489
490 bind(unlocked);
491 cmp(zr, zr); // Set Flags to EQ => fast path
492
493 #ifdef ASSERT
494 // Check that unlocked label is reached with Flags == EQ.
495 Label flag_correct;
496 br(Assembler::EQ, flag_correct);
497 stop("Fast Unlock Flag != EQ");
498 #endif
499
500 bind(slow_path);
501 #ifdef ASSERT
502 // Check that slow_path label is reached with Flags == NE.
503 br(Assembler::NE, flag_correct);
504 stop("Fast Unlock Flag != NE");
505 bind(flag_correct);
506 #endif
507 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
508 }
509
510 // Search for str1 in str2 and return index or -1
511 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
// Emits code searching for the pattern (str1, cnt1) inside the source
// (str2, cnt2); result receives the match index or -1. icnt1 is the pattern
// length when known at compile time, or -1 for a runtime length; ae encodes
// the Latin1/UTF-16 combination of the two strings. Large runtime-length
// patterns use Boyer-Moore-Horspool (bad-character rule only), medium ones a
// stub, and small/constant ones specialized linear scans.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    // Runtime-length pattern: pick the algorithm based on cnt1 and cnt2.
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore alogorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes and algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has few java-specific optimizations.
//
// #define ASIZE 256
//
// int bm(unsigned char *x, int m, unsigned char *y, int n) {
//   int i, j;
//   unsigned c;
//   unsigned char bc[ASIZE];
//
//   /* Preprocessing */
//   for (i = 0; i < ASIZE; ++i)
//     bc[i] = m;
//   for (i = 0; i < m - 1; ) {
//     c = x[i];
//     ++i;
//     // c < 256 for Latin1 string, so, no need for branch
//     #ifdef PATTERN_STRING_IS_LATIN1
//     bc[c] = m - i;
//     #else
//     if (c < ASIZE) bc[c] = m - i;
//     #endif
//   }
//
//   /* Searching */
//   j = 0;
//   while (j <= n - m) {
//     c = y[i+j];
//     if (x[m-1] == c)
//       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//     if (i < 0) return j;
//     // c < 256 for Latin1 string, so, no need for branch
//     #ifdef SOURCE_STRING_IS_LATIN1
//     // LL case: (c< 256) always true. Remove branch
//     j += bc[y[j+m-1]];
//     #endif
//     #ifndef PATTERN_STRING_IS_UTF
//     // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//     if (c < ASIZE)
//       j += bc[y[j+m-1]];
//     else
//       j += 1
//     #endif
//     #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//     // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//     if (c < ASIZE)
//       j += bc[y[j+m-1]];
//     else
//       j += m
//     #endif
//   }
// }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    // Reserve the ASIZE-byte bad-character table on the stack and fill it
    // with the pattern length (v0 was dup'ed with cnt1 above).
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

    // Preprocessing: record, for each pattern char, its distance from the
    // pattern end (skipping chars >= ASIZE for UTF-16 patterns).
      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

    // Pre-load the tail of the pattern into tmp6 once, instead of
    // re-reading it character-by-character in the inner loop.
      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      // Compare the last pattern char against the aligned source char; on
      // match, fall into the right-to-left verification loop.
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      // Advance str2 by the bad-character skip distance.
      if (!isL) {
        // if we've met UTF symbol while searching Latin1 pattern, then we can
        // skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE); // deallocate the bad-character table
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      // tmp5 holds the original str2, so the difference is the match offset
      // in bytes; convert to a char index for UTF-16 sources.
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE); // deallocate the bad-character table
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      // Code cache full: abandon emission.
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
      Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

      cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
      br(LT, DOSHORT);
    BIND(LINEAR_MEDIUM);
      // Scan for the first pattern char; on a hit, verify the remainder with
      // negative offsets counting up toward zero.
      (this->*str1_load_1chr)(first, Address(str1));
      lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
      sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(FIRST_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmp(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      adds(cnt1tmp, cnt1_neg, str1_chr_size);
      add(cnt2tmp, cnt2_neg, str2_chr_size);
      br(GE, MATCH);

    BIND(STR1_NEXT);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      adds(cnt1tmp, cnt1tmp, str1_chr_size);
      add(cnt2tmp, cnt2tmp, str2_chr_size);
      br(LT, STR1_NEXT);
      b(MATCH);

    BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      // Constant 4-char pattern: compare it as a single word per position.
      Label CH1_LOOP;

      (this->*load_4chr)(ch1, str1);
      sub(result_tmp, cnt2, 4);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

    BIND(CH1_LOOP);
      (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      // 2-char pattern: compare as one halfword/word per position.
      Label CH1_LOOP;

    BIND(DO2);
      (this->*load_2chr)(ch1, str1);
      if (icnt1 == 2) {
        sub(result_tmp, cnt2, 2);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(CH1_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmp(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, CH1_LOOP);
      b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      // 3-char pattern: match first 2 chars as a unit, then check the 3rd.
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

    BIND(DO3);
      (this->*load_2chr)(first, str1);
      (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
      if (icnt1 == 3) {
        sub(result_tmp, cnt2, 3);
      }
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
    BIND(FIRST_LOOP);
      (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
      cmpw(first, ch2);
      br(EQ, STR1_LOOP);
    BIND(STR2_NEXT);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LE, FIRST_LOOP);
      b(NOMATCH);

    BIND(STR1_LOOP);
      add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
      cmp(ch1, ch2);
      br(NE, STR2_NEXT);
      b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      // Single-char pattern: broadcast the char across a register and use the
      // SWAR has-zero trick to test 8 bytes / 4 chars per iteration.
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

    BIND(DO1);
      (this->*str1_load_1chr)(ch1, str1);
      cmp(cnt2, (u1)8);
      br(LT, DO1_SHORT);

      sub(result_tmp, cnt2, 8/str2_chr_size);
      sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
      lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

      if (str2_isL) {
        orr(ch1, ch1, ch1, LSL, 8);
      }
      orr(ch1, ch1, ch1, LSL, 16);
      orr(ch1, ch1, ch1, LSL, 32);
    BIND(CH1_LOOP);
      ldr(ch2, Address(str2, cnt2_neg));
      // After the eor, a matching char becomes a zero byte/halfword; the
      // sub/orr/bics sequence leaves tmp1 != 0 exactly when one exists.
      eor(ch2, ch1, ch2);
      sub(tmp1, ch2, tmp3);
      orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      bics(tmp1, tmp1, tmp2);
      br(NE, HAS_ZERO);
      adds(cnt2_neg, cnt2_neg, 8);
      br(LT, CH1_LOOP);

      // Handle the last (possibly overlapping) 8-byte chunk.
      cmp(cnt2_neg, (u1)8);
      mov(cnt2_neg, 0);
      br(LT, CH1_LOOP);
      b(NOMATCH);

    BIND(HAS_ZERO);
      // Locate the first zero via rev+clz and convert bits to bytes.
      rev(tmp1, tmp1);
      clz(tmp1, tmp1);
      add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
      b(MATCH);

    BIND(DO1_SHORT);
      mov(result_tmp, cnt2);
      lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
      sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
    BIND(DO1_LOOP);
      (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
      cmpw(ch1, ch2);
      br(EQ, MATCH);
      adds(cnt2_neg, cnt2_neg, str2_chr_size);
      br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the negative byte offset back into a char index relative to
    // the search window start.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}
947
// Member-function-pointer types used to parameterize the string intrinsics
// below over the string encoding: a chr_insn selects between byte (ldrb/strb)
// and halfword (ldrh/strh) character accesses, and a uxt_insn selects between
// byte (uxtbw) and halfword (uxthw) zero-extension.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
950
// Find the first occurrence of the 16-bit char 'ch' in the UTF-16 string
// [str1, str1 + 2*cnt1). Sets 'result' to the character index of the first
// match, or -1 if 'ch' does not occur. 'cnt1' is the length in characters.
//
// Clobbers: str1, cnt1, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;      // reused as a negative byte offset from the string end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // An empty string cannot contain the char.
  cbz(cnt1, NOMATCH);

  // Strings shorter than 4 chars (one 8-byte word) take the char-at-a-time path.
  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate 'ch' into all four 16-bit lanes of a 64-bit register.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the start of the last full word and walk the string with a
  // negative offset, so loop termination is a simple sign test on the offset.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
  // SWAR search over 4 chars at a time: XOR turns a matching lane into zero,
  // and (x - 0x0001...) & ~(x | 0x7fff...) leaves the sign bit set in every
  // 16-bit lane that was zero.
  ldr(ch2, Address(str2, cnt2_neg)); // NOTE(review): see stringL_indexof_char for the byte variant
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7fff7fff7fff7fff);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  // Fewer than 8 bytes left: re-scan the final word at offset 0. Some chars
  // may be tested twice, which is cheaper than a dedicated tail loop.
  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  // Find the lowest-addressed matching lane: byte-reverse so clz counts from
  // the low end, then convert the bit index to a byte offset (LSR 3).
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  // Short-string path: one halfword per iteration.
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
  ldrh(ch1, Address(str1, cnt1_neg));
  cmpw(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 2);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  // Convert the (negative) byte offset from the end into a character index.
  add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
1013
1014 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1015 Register ch, Register result,
1016 FloatRegister ztmp1,
1017 FloatRegister ztmp2,
1018 PRegister tmp_pg,
1019 PRegister tmp_pdn, bool isL)
1020 {
1021 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1022 assert(tmp_pg->is_governing(),
1023 "this register has to be a governing predicate register");
1024
1025 Label LOOP, MATCH, DONE, NOMATCH;
1026 Register vec_len = rscratch1;
1027 Register idx = rscratch2;
1028
1029 SIMD_RegVariant T = (isL == true) ? B : H;
1030
1031 cbz(cnt1, NOMATCH);
1032
1033 // Assign the particular char throughout the vector.
1034 sve_dup(ztmp2, T, ch);
1035 if (isL) {
1036 sve_cntb(vec_len);
1037 } else {
1038 sve_cnth(vec_len);
1039 }
1040 mov(idx, 0);
1041
1042 // Generate a predicate to control the reading of input string.
1043 sve_whilelt(tmp_pg, T, idx, cnt1);
1044
1045 BIND(LOOP);
1046 // Read a vector of 8- or 16-bit data depending on the string type. Note
1047 // that inactive elements indicated by the predicate register won't cause
1048 // a data read from memory to the destination vector.
1049 if (isL) {
1050 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1051 } else {
1052 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1053 }
1054 add(idx, idx, vec_len);
1055
1056 // Perform the comparison. An element of the destination predicate is set
1057 // to active if the particular char is matched.
1058 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1059
1060 // Branch if the particular char is found.
1061 br(NE, MATCH);
1062
1063 sve_whilelt(tmp_pg, T, idx, cnt1);
1064
1065 // Loop back if the particular char not found.
1066 br(MI, LOOP);
1067
1068 BIND(NOMATCH);
1069 mov(result, -1);
1070 b(DONE);
1071
1072 BIND(MATCH);
1073 // Undo the index increment.
1074 sub(idx, idx, vec_len);
1075
1076 // Crop the vector to find its location.
1077 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1078 add(result, idx, -1);
1079 sve_incp(result, T, tmp_pdn);
1080 BIND(DONE);
1081 }
1082
// Find the first occurrence of the byte 'ch' in the Latin-1 string
// [str1, str1 + cnt1). Sets 'result' to the byte index of the first match,
// or -1 if 'ch' does not occur. Byte-element analogue of string_indexof_char.
//
// Clobbers: str1, cnt1, tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;      // reused as a negative byte offset from the string end
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // An empty string cannot contain the char.
  cbz(cnt1, NOMATCH);

  // Strings shorter than 8 bytes (one word) take the byte-at-a-time path.
  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate 'ch' into all eight byte lanes of a 64-bit register.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the start of the last full word and walk the string with a
  // negative offset, so loop termination is a simple sign test on the offset.
  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
  // SWAR search over 8 bytes at a time: XOR turns a matching byte into zero,
  // and (x - 0x01...) & ~(x | 0x7f...) leaves the sign bit set in every byte
  // lane that was zero.
  ldr(ch1, Address(str1, cnt1_neg));
  eor(ch1, ch, ch1);
  sub(tmp1, ch1, tmp3);
  orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
  bics(tmp1, tmp1, tmp2);
  br(NE, HAS_ZERO);
  adds(cnt1_neg, cnt1_neg, 8);
  br(LT, CH1_LOOP);

  // Fewer than 8 bytes left: re-scan the final word at offset 0. Some bytes
  // may be tested twice, which is cheaper than a dedicated tail loop.
  cmp(cnt1_neg, (u1)8);
  mov(cnt1_neg, 0);
  br(LT, CH1_LOOP);
  b(NOMATCH);

  BIND(HAS_ZERO);
  // Find the lowest-addressed matching byte: byte-reverse so clz counts from
  // the low end, then convert the bit index to a byte offset (LSR 3).
  rev(tmp1, tmp1);
  clz(tmp1, tmp1);
  add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
  b(MATCH);

  BIND(DO1_SHORT);
  // Short-string path: one byte per iteration.
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
  ldrb(ch1, Address(str1, cnt1_neg));
  cmp(ch, ch1);
  br(EQ, MATCH);
  adds(cnt1_neg, cnt1_neg, 1);
  br(LT, DO1_LOOP);
  BIND(NOMATCH);
  mov(result, -1);
  b(DONE);
  BIND(MATCH);
  // Byte offset from the end plus the base gives the byte index.
  add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1146
1147 // Compare strings.
// Lexicographic string comparison for all four encoding combinations
// (LL, UU, LU, UL, selected by 'ae'). On return, 'result' is negative, zero
// or positive per the usual compareTo contract. Very long strings are handed
// off to out-of-line stubs; the main path compares 8 bytes per iteration,
// inflating Latin-1 data to UTF-16 on the fly with vector zips.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Number of characters that fit in one 8-byte comparison word.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  // If the common prefix is equal, this length difference is the result.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Identical array references compare equal up to min length; the saved
      // length difference in 'result' is already the answer.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Iterate with negative offsets from the string ends.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Inflate 4 Latin-1 bytes of str1 to UTF-16 by zipping with a zero
      // vector, so both sides can be compared as 8-byte words.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror of the LU case: inflate str2 instead.
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    // If the first longword was also the last, only the tail check remains.
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    // rev + clz locates the lowest-addressed differing byte; the andr
    // rounds the bit index down to a character boundary (8 or 16 bits).
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  // Long strings: call the out-of-line comparison stub for this encoding.
  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // The loop is software-pipelined two chars deep: tmp1/cnt1 hold the pair
  // being compared while tmp2/rscratch1 hold the pair just loaded.
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1382
1383 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1384 FloatRegister src2, Condition cond, bool isQ) {
1385 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1386 FloatRegister zn = src1, zm = src2;
1387 bool needs_negation = false;
1388 switch (cond) {
1389 case LT: cond = GT; zn = src2; zm = src1; break;
1390 case LE: cond = GE; zn = src2; zm = src1; break;
1391 case LO: cond = HI; zn = src2; zm = src1; break;
1392 case LS: cond = HS; zn = src2; zm = src1; break;
1393 case NE: cond = EQ; needs_negation = true; break;
1394 default:
1395 break;
1396 }
1397
1398 if (is_floating_point_type(bt)) {
1399 fcm(cond, dst, size, zn, zm);
1400 } else {
1401 cm(cond, dst, size, zn, zm);
1402 }
1403
1404 if (needs_negation) {
1405 notr(dst, isQ ? T16B : T8B, dst);
1406 }
1407 }
1408
1409 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1410 Condition cond, bool isQ) {
1411 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1412 if (bt == T_FLOAT || bt == T_DOUBLE) {
1413 if (cond == Assembler::NE) {
1414 fcm(Assembler::EQ, dst, size, src);
1415 notr(dst, isQ ? T16B : T8B, dst);
1416 } else {
1417 fcm(cond, dst, size, src);
1418 }
1419 } else {
1420 if (cond == Assembler::NE) {
1421 cm(Assembler::EQ, dst, size, src);
1422 notr(dst, isQ ? T16B : T8B, dst);
1423 } else {
1424 cm(cond, dst, size, src);
1425 }
1426 }
1427 }
1428
1429 // Compress the least significant bit of each byte to the rightmost and clear
1430 // the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  // Each step folds pairs of lanes together, halving the stride: after three
  // doubling OR steps the 8 mask bits have accumulated in the low byte.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  // Mask off the garbage left in the upper bits by the shifted ORs.
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1439
1440 // Pack the value of each mask element in "src" into a long value in "dst", at most
1441 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1442 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1443 // one bit in "dst".
1444 //
1445 // Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1446 // Expected: dst = 0x658D
1447 //
1448 // Clobbers: rscratch1
1449 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1450 FloatRegister vtmp, int lane_cnt) {
1451 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1452 assert_different_registers(dst, rscratch1);
1453 assert_different_registers(src, vtmp);
1454 assert(UseSVE > 0, "must be");
1455
1456 // Compress the lowest 8 bytes.
1457 fmovd(dst, src);
1458 bytemask_compress(dst);
1459 if (lane_cnt <= 8) return;
1460
1461 // Repeat on higher bytes and join the results.
1462 // Compress 8 bytes in each iteration.
1463 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1464 sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1465 bytemask_compress(rscratch1);
1466 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1467 }
1468 }
1469
// This function is the same as "sve_vmask_tolong" above, but it uses SVE2's
// BEXT instruction, which requires the FEAT_BITPERM feature.
void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          int lane_cnt) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(src, vtmp1, vtmp2);
  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");

  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
  // is to compress each significant bit of the byte in a cross-lane way. Due
  // to the lack of a cross-lane bit-compress instruction, we use BEXT
  // (bit-compress in each lane) with the biggest lane size (T = D) then
  // concatenate the results.

  // The second source input of BEXT, initialized with 0x01 in each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BEXT vtmp1.D, src.D, vtmp2.D
  // src   = 0x0001010000010001 | 0x0100000001010001
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  sve_bext(vtmp1, D, src, vtmp2);

  // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
  // result to dst.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  // dst   = 0x658D
  if (lane_cnt <= 8) {
    // No need to concatenate: the whole mask is in the low byte of lane 0.
    umov(dst, vtmp1, B, 0);
  } else if (lane_cnt <= 16) {
    // Move the second slice's byte next to the first, then read 16 bits.
    ins(vtmp1, B, vtmp1, 1, 8);
    umov(dst, vtmp1, H, 0);
  } else {
    // As the lane count is 64 at most, the final expected value must be in
    // the lowest 64 bits after narrowing vtmp1 from D to B.
    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
    umov(dst, vtmp1, D, 0);
  }
}
1513
1514 // Unpack the mask, a long value in "src", into a vector register of boolean
1515 // represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
1516 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
1517 // most 64 lanes.
1518 //
1519 // Below example gives the expected dst vector register, with a valid src(0x658D)
1520 // on a 128-bit vector size machine.
1521 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
                                           FloatRegister vtmp, int lane_cnt) {
  assert_different_registers(dst, vtmp);
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");

  // Example: src = 0x658D, lane_cnt = 16
  // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01

  // Put long value from general purpose register into the first lane of vector.
  // vtmp = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp, B, 0);
  mov(vtmp, D, 0, src);

  // Transform the value in the first lane which is mask in bit now to the mask in
  // byte, which can be done by SVE2's BDEP instruction.

  // The first source input of BDEP instruction. Deposit one mask byte into
  // each 8-byte (D) lane, so every lane holds the 8 bits it will expand.
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp, B, vtmp, 8, 1);
  } else {
    sve_vector_extend(vtmp, D, vtmp, B);
  }

  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(dst, B, 1);

  // BDEP dst.D, vtmp.D, dst.D
  // vtmp = 0x0000000000000065 | 0x000000000000008D
  // dst  = 0x0101010101010101 | 0x0101010101010101
  //        ---------------------------------------
  // dst  = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(dst, D, vtmp, dst);
}
1560
1561 // Clobbers: rflags
1562 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1563 FloatRegister zn, FloatRegister zm, Condition cond) {
1564 assert(pg->is_governing(), "This register has to be a governing predicate register");
1565 FloatRegister z1 = zn, z2 = zm;
1566 switch (cond) {
1567 case LE: z1 = zm; z2 = zn; cond = GE; break;
1568 case LT: z1 = zm; z2 = zn; cond = GT; break;
1569 case LO: z1 = zm; z2 = zn; cond = HI; break;
1570 case LS: z1 = zm; z2 = zn; cond = HS; break;
1571 default:
1572 break;
1573 }
1574
1575 SIMD_RegVariant size = elemType_to_regVariant(bt);
1576 if (is_floating_point_type(bt)) {
1577 sve_fcm(cond, pd, size, pg, z1, z2);
1578 } else {
1579 assert(is_integral_type(bt), "unsupported element type");
1580 sve_cmp(cond, pd, size, pg, z1, z2);
1581 }
1582 }
1583
1584 // Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Reverse the predicate so the last set lane becomes the first, then break
  // *before* the first set lane: ptmp now has one bit per lane preceding it.
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  // Count those lanes and flip the reversed index back to a forward index.
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1593
1594 // Extend integer vector src to dst with the same lane count
1595 // but larger element size, e.g. 4B -> 4I
1596 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1597 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1598 if (src_bt == T_BYTE) {
1599 // 4B to 4S/4I, 8B to 8S
1600 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1601 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1602 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1603 if (dst_bt == T_INT) {
1604 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1605 }
1606 } else if (src_bt == T_SHORT) {
1607 // 2S to 2I/2L, 4S to 4I
1608 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1609 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1610 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1611 if (dst_bt == T_LONG) {
1612 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1613 }
1614 } else if (src_bt == T_INT) {
1615 // 2I to 2L
1616 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1617 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1618 } else {
1619 ShouldNotReachHere();
1620 }
1621 }
1622
1623 // Narrow integer vector src down to dst with the same lane count
1624 // but smaller element size, e.g. 4I -> 4B
1625 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1626 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1627 if (src_bt == T_SHORT) {
1628 // 4S/8S to 4B/8B
1629 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1630 assert(dst_bt == T_BYTE, "unsupported");
1631 xtn(dst, T8B, src, T8H);
1632 } else if (src_bt == T_INT) {
1633 // 2I to 2S, 4I to 4B/4S
1634 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1635 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1636 xtn(dst, T4H, src, T4S);
1637 if (dst_bt == T_BYTE) {
1638 xtn(dst, T8B, dst, T8H);
1639 }
1640 } else if (src_bt == T_LONG) {
1641 // 2L to 2S/2I
1642 assert(src_vlen_in_bytes == 16, "unsupported");
1643 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1644 xtn(dst, T2S, src, T2D);
1645 if (dst_bt == T_SHORT) {
1646 xtn(dst, T4H, dst, T4S);
1647 }
1648 } else {
1649 ShouldNotReachHere();
1650 }
1651 }
1652
// Widen each SVE vector element of 'src' into 'dst' from src_size up to
// dst_size (e.g. B -> D), signed or unsigned per 'is_unsigned'. Each step
// unpacks the low half one size at a time (B->H->S->D) until dst_size is
// reached.
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          bool is_unsigned) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

  if (src_size == B) {
    switch (dst_size) {
    case H:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      break;
    case S:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      break;
    case D:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
    } else { // D
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
    }
  } else if (src_size == S) {
    _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
  }
}
1686
1687 // Vector narrow from src to dst with specified element sizes.
1688 // High part of dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  // 'tmp' is an all-zero vector used as the second uzp1 source, so the high
  // part of the result is filled with zero (see header comment).
  sve_dup(tmp, src_size, 0);
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      // Multi-step narrowing reads dst again, so it must not alias tmp.
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}
1726
1727 // Extend src predicate to dst predicate with the same lane count but larger
1728 // element size, e.g. 64Byte -> 512Long
1729 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1730 uint dst_element_length_in_bytes,
1731 uint src_element_length_in_bytes) {
1732 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1733 sve_punpklo(dst, src);
1734 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1735 sve_punpklo(dst, src);
1736 sve_punpklo(dst, dst);
1737 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1738 sve_punpklo(dst, src);
1739 sve_punpklo(dst, dst);
1740 sve_punpklo(dst, dst);
1741 } else {
1742 assert(false, "unsupported");
1743 ShouldNotReachHere();
1744 }
1745 }
1746
1747 // Narrow src predicate to dst predicate with the same lane count but
1748 // smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be -
  // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  // ptmp supplies the zero upper half in every uzp1 below.
  sve_pfalse(ptmp);
  // Each uzp1 halves the element size; chain enough steps for the 2x/4x/8x
  // narrowing factor.
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}
1776
1777 // Vector reduction add for integral type with ASIMD instructions.
// Reduce 'vsrc' by addition across all lanes, add the scalar 'isrc', and
// leave the sum in 'dst'. Element type is 'bt'; vector width is 8 or 16
// bytes. 'vtmp' holds the intermediate across-lanes sum.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
  switch(bt) {
    case T_BYTE:
      // Across-lanes add, then sign-extend lane 0 and fold in 'isrc' with
      // a sign-extending add so the result wraps like a byte sum.
      addv(vtmp, isQ ? T16B : T8B, vsrc);
      smov(dst, vtmp, B, 0);
      addw(dst, dst, isrc, ext::sxtb);
      break;
    case T_SHORT:
      addv(vtmp, isQ ? T8H : T4H, vsrc);
      smov(dst, vtmp, H, 0);
      addw(dst, dst, isrc, ext::sxth);
      break;
    case T_INT:
      // The 8-byte case has only two S lanes, so a pairwise add suffices.
      isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
      umov(dst, vtmp, S, 0);
      addw(dst, dst, isrc);
      break;
    case T_LONG:
      // Only two D lanes exist; add them pairwise.
      assert(isQ, "unsupported");
      addpd(vtmp, vsrc);
      umov(dst, vtmp, D, 0);
      add(dst, dst, isrc);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}
1815
1816 // Vector reduction multiply for integral type with ASIMD instructions.
1817 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1818 // Clobbers: rscratch1
// Reduce 'vsrc' by multiplication across all lanes, multiply by the scalar
// 'isrc', and leave the product in 'dst'. The vector is folded in halves:
// each step multiplies the upper half into the lower half until two lanes
// remain, which are combined through the scalar unit.
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        // Multiply the lower half and higher half of vector iteratively.
        // vtmp1 = vsrc[8:15]
        ins(vtmp1, D, vsrc, 0, 1);
        // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
        mulv(vtmp1, T8B, vtmp1, vsrc);
        // vtmp2 = vtmp1[4:7]
        ins(vtmp2, S, vtmp1, 0, 1);
        // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
        mulv(vtmp1, T8B, vtmp2, vtmp1);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T8B, vtmp1, vsrc);
      }
      // vtmp2 = vtmp1[2:3]
      ins(vtmp2, H, vtmp1, 0, 1);
      // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
      mulv(vtmp2, T8B, vtmp2, vtmp1);
      // dst = vtmp2[0] * isrc * vtmp2[1], re-sign-extended to byte range
      // after each scalar multiply.
      umov(rscratch1, vtmp2, B, 0);
      mulw(dst, rscratch1, isrc);
      sxtb(dst, dst);
      umov(rscratch1, vtmp2, B, 1);
      mulw(dst, rscratch1, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      // Same halving scheme with 16-bit lanes; two vector folds for Q,
      // one for D, then combine the last two lanes in the scalar unit.
      if (isQ) {
        ins(vtmp2, D, vsrc, 0, 1);
        mulv(vtmp2, T4H, vtmp2, vsrc);
        ins(vtmp1, S, vtmp2, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vtmp2);
      } else {
        ins(vtmp1, S, vsrc, 0, 1);
        mulv(vtmp1, T4H, vtmp1, vsrc);
      }
      umov(rscratch1, vtmp1, H, 0);
      mulw(dst, rscratch1, isrc);
      sxth(dst, dst);
      umov(rscratch1, vtmp1, H, 1);
      mulw(dst, rscratch1, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      // One vector fold for Q; a D-width vector already has just two lanes.
      if (isQ) {
        ins(vtmp1, D, vsrc, 0, 1);
        mulv(vtmp1, T2S, vtmp1, vsrc);
      } else {
        vtmp1 = vsrc;
      }
      umov(rscratch1, vtmp1, S, 0);
      mul(dst, rscratch1, isrc);
      umov(rscratch1, vtmp1, S, 1);
      mul(dst, rscratch1, dst);
      break;
    case T_LONG:
      // Only two 64-bit lanes: multiply them through the scalar unit.
      umov(rscratch1, vsrc, D, 0);
      mul(dst, isrc, rscratch1);
      umov(rscratch1, vsrc, D, 1);
      mul(dst, dst, rscratch1);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}
1896
1897 // Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
  switch(bt) {
    case T_FLOAT:
      // Accumulate lanes strictly in order:
      // dst = (((fsrc * vsrc[0]) * vsrc[1]) * vsrc[2]) * vsrc[3]
      // Each lane is first moved into lane 0 of vtmp so the scalar
      // multiply can read it.
      fmuls(dst, fsrc, vsrc);
      ins(vtmp, S, vsrc, 0, 1);
      fmuls(dst, dst, vtmp);
      if (isQ) {
        ins(vtmp, S, vsrc, 0, 2);
        fmuls(dst, dst, vtmp);
        ins(vtmp, S, vsrc, 0, 3);
        fmuls(dst, dst, vtmp);
      }
      break;
    case T_DOUBLE:
      // Doubles only come in 128-bit vectors (two lanes).
      assert(isQ, "unsupported");
      fmuld(dst, fsrc, vsrc);
      ins(vtmp, D, vsrc, 0, 1);
      fmuld(dst, dst, vtmp);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}
1930
1931 // Helper to select logical instruction
1932 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1933 Register Rn, Register Rm,
1934 enum shift_kind kind, unsigned shift) {
1935 switch(opc) {
1936 case Op_AndReductionV:
1937 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1938 break;
1939 case Op_OrReductionV:
1940 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1941 break;
1942 case Op_XorReductionV:
1943 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1944 break;
1945 default:
1946 assert(false, "unsupported");
1947 ShouldNotReachHere();
1948 }
1949 }
1950
1951 // Vector reduction logical operations And, Or, Xor
1952 // Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
  // Move the two vector halves into general registers (64 bits each for a
  // 128-bit vector, 32 bits each for a 64-bit vector) and combine them with
  // the scalar form of the logical operation.
  umov(rscratch1, vsrc, isQ ? D : S, 0);
  umov(dst, vsrc, isQ ? D : S, 1);
  neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
  // Keep folding dst onto itself with halving logical-shift-right operands
  // until all lanes are combined into the lowest element, then fold in the
  // scalar input isrc and sign-extend sub-word results.
  switch(bt) {
    case T_BYTE:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxtb(dst, dst);
      break;
    case T_SHORT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      sxth(dst, dst);
      break;
    case T_INT:
      if (isQ) {
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
      }
      neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
      break;
    case T_LONG:
      // 64-bit lanes only exist in 128-bit vectors; the two halves were
      // already combined above, so only isrc remains to be folded in.
      assert(isQ, "unsupported");
      neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  BLOCK_COMMENT("} neon_reduce_logical");
}
2000
2001 // Helper function to decode min/max reduction operation properties
2002 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2003 bool* is_unsigned,
2004 Condition* cond) {
2005 switch(opc) {
2006 case Op_MinReductionV:
2007 *is_min = true; *is_unsigned = false; *cond = LT; break;
2008 case Op_MaxReductionV:
2009 *is_min = false; *is_unsigned = false; *cond = GT; break;
2010 case Op_UMinReductionV:
2011 *is_min = true; *is_unsigned = true; *cond = LO; break;
2012 case Op_UMaxReductionV:
2013 *is_min = false; *is_unsigned = true; *cond = HI; break;
2014 default:
2015 ShouldNotReachHere();
2016 }
2017 }
2018
2019 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
2020 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2021 // Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
         opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min;
  bool is_unsigned;
  Condition cond;
  decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
  BLOCK_COMMENT("neon_reduce_minmax_integral {");
  if (bt == T_LONG) {
    // Move both 64-bit lanes to general registers and reduce with
    // compare-and-select, folding in the scalar input isrc first.
    assert(vtmp == fnoreg, "should be");
    assert(isQ, "should be");
    umov(rscratch1, vsrc, D, 0);
    cmp(isrc, rscratch1);
    csel(dst, isrc, rscratch1, cond);
    umov(rscratch1, vsrc, D, 1);
    cmp(dst, rscratch1);
    csel(dst, dst, rscratch1, cond);
  } else {
    SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
    if (size == T2S) {
      // For T2S (2x32-bit elements), use pairwise instructions because
      // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
      neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
    } else {
      // For other sizes, use reduction to scalar instructions.
      neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
    }
    // Move the reduced lane to a general register, widening sub-word
    // elements with the signedness that matches the operation.
    if (bt == T_INT) {
      umov(dst, vtmp, S, 0);
    } else if (is_unsigned) {
      umov(dst, vtmp, elemType_to_regVariant(bt), 0);
    } else {
      smov(dst, vtmp, elemType_to_regVariant(bt), 0);
    }
    // Fold in the scalar input isrc.
    cmpw(dst, isrc);
    cselw(dst, dst, isrc, cond);
  }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}
2068
2069 // Vector reduction for integral type with SVE instruction.
2070 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
2071 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      // Reduce the vector in tmp, then add the scalar input src1,
      // sign-extending sub-word elements during the add.
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      // Reduce with AND across lanes, move out of the vector register
      // (sign-extending sub-word elements), then fold in src1.
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV:
    case Op_MinReductionV:
    case Op_UMaxReductionV:
    case Op_UMinReductionV: {
      bool is_min;
      bool is_unsigned;
      Condition cond;
      decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
      sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
      // Move result from vector to general register
      if (is_unsigned || bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      // Fold in the scalar input src1 with compare-and-select.
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, cond);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, cond);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // The scalar logical ops above work on 32-bit (or 64-bit) values;
  // sign-extend sub-word results down to their element size.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
2179
2180 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2181 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2182 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  // Patterns VL1..VL8 encode the lane count directly.
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue": the largest power of two, multiple of 4,
  // or multiple of 3 that fits in the vector.
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}
2246
2247 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2248 // Any remaining elements of dst will be filled with zero.
2249 // Clobbers: rscratch1
2250 // Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vzr, FloatRegister vtmp,
                                           PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  // When called by sve_compress_byte, src and vtmp may be the same register.
  assert_different_registers(dst, src, vzr);
  assert_different_registers(dst, vtmp, vzr);
  assert_different_registers(mask, pgtmp);
  // high <-- low
  // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
  //                mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
  // Expected result: dst = 00 00 00 hh ee dd bb aa
  //
  // SVE "compact" only supports S and D element sizes, so the SHORT lanes
  // are widened to INT, compacted, narrowed back, and the two halves are
  // spliced together.

  // Extend lowest half to type INT.
  // dst = 00dd 00cc 00bb 00aa
  sve_uunpklo(dst, S, src);
  // pgtmp = 0001 0000 0001 0001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst = 0000 00dd 00bb 00aa
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 00 00 00 00 00 dd bb aa
  sve_uzp1(dst, H, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }

  // Count the active elements of lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 0001 0000 0000 0001
  sve_punpkhi(pgtmp, mask);
  // vtmp = 00hh 00gg 00ff 00ee
  sve_uunpkhi(vtmp, S, src);
  // vtmp = 0000 0000 00hh 00ee
  sve_compact(vtmp, S, vtmp, pgtmp);
  // vtmp = 00 00 00 00 00 00 hh ee
  sve_uzp1(vtmp, H, vtmp, vzr);

  // pgtmp covers the lanes already filled by the compressed low half, so
  // splice appends the compressed high half right after them.
  // pgtmp = 00 00 00 00 00 01 01 01
  sve_whilelt(pgtmp, H, zr, rscratch1);
  // Compressed low:   dst = 00 00 00 00 00 dd bb aa
  // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
  // Combine the compressed low with the compressed high:
  // dst = 00 00 00 hh ee dd bb aa
  sve_splice(dst, H, pgtmp, vtmp);
}
2305
2306 // Clobbers: rscratch1, rscratch2
2307 // Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                          PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
  assert_different_registers(mask, ptmp, pgtmp);
  // high <-- low
  // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
  //                mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
  // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  //
  // Widen BYTE lanes to SHORT and delegate each half to sve_compress_short,
  // then narrow back and splice the halves together.
  FloatRegister vzr = vtmp3;
  sve_dup(vzr, B, 0);

  // Extend lowest half to type SHORT.
  // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 00 01 00 00 00 01 00 01
  sve_punpklo(ptmp, mask);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst = 00 00 00 00 00 0g 0c 0a
  // The widened data is twice the size, capped at MaxVectorSize.
  unsigned extended_size = vector_length_in_bytes << 1;
  sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
  // Narrow the result back to type BYTE.
  // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  sve_uzp1(dst, B, dst, vzr);

  // Return if the vector length is no more than MaxVectorSize/2, since the
  // highest half is invalid.
  if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
    return;
  }
  // Count the active elements of lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);

  // Repeat to the highest half.
  // ptmp = 00 01 00 00 00 00 00 01
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 00 00 00 00 00 00 0p 0i
  sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
  // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  sve_uzp1(vtmp1, B, vtmp1, vzr);

  // ptmp covers the lanes already filled by the compressed low half, so
  // splice appends the compressed high half right after them.
  // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
  sve_whilelt(ptmp, B, zr, rscratch2);
  // Compressed low:   dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
  // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
  // Combine the compressed low with the compressed high:
  // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
  sve_splice(dst, B, ptmp, vtmp1);
}
2362
2363 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2364 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2365 SIMD_Arrangement size = isQ ? T16B : T8B;
2366 if (bt == T_BYTE) {
2367 rbit(dst, size, src);
2368 } else {
2369 neon_reverse_bytes(dst, src, bt, isQ);
2370 rbit(dst, size, dst);
2371 }
2372 }
2373
2374 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2375 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2376 SIMD_Arrangement size = isQ ? T16B : T8B;
2377 switch (bt) {
2378 case T_BYTE:
2379 if (dst != src) {
2380 orr(dst, size, src, src);
2381 }
2382 break;
2383 case T_SHORT:
2384 rev16(dst, size, src);
2385 break;
2386 case T_INT:
2387 rev32(dst, size, src);
2388 break;
2389 case T_LONG:
2390 rev64(dst, size, src);
2391 break;
2392 default:
2393 assert(false, "unsupported");
2394 ShouldNotReachHere();
2395 }
2396 }
2397
2398 // VectorRearrange implementation for short/int/float/long/double types with NEON
2399 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2400 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2401 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2402 // and use bsl to implement the operation.
void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
                                           FloatRegister shuffle, FloatRegister tmp,
                                           BasicType bt, bool isQ) {
  assert_different_registers(dst, src, shuffle, tmp);
  SIMD_Arrangement size1 = isQ ? T16B : T8B;
  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

  // Here is an example that rearranges a NEON vector with 4 ints:
  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
  //   1. We assume the shuffle input is Vi int[2, 3, 0, 1].
  //   2. Multiply Vi int[2, 3, 0, 1] with constant int vector
  //      [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
  //      tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
  //   3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
  //      and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
  //   4. Use Vm as index register, and use V1 as table register.
  //      Then get V2 as the result by tbl NEON instructions.
  switch (bt) {
    case T_SHORT:
      // Scale each lane index by the 2-byte element size, then add the
      // per-lane byte offsets 0x0100 to form byte-granular tbl indices.
      mov(tmp, size1, 0x02);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x0100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_INT:
    case T_FLOAT:
      // Same scheme with the 4-byte element size and offsets 0x03020100.
      mov(tmp, size1, 0x04);
      mulv(dst, size2, shuffle, tmp);
      mov(tmp, size2, 0x03020100);
      addv(dst, size1, dst, tmp);
      tbl(dst, size1, src, 1, dst);
      break;
    case T_LONG:
    case T_DOUBLE:
      // Load the iota indices for Long type. The indices are ordered by
      // type B/S/I/L/F/D, and the offset between two types is 16; Hence
      // the offset for L is 48.
      lea(rscratch1,
          ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
      ldrq(tmp, rscratch1);
      // Check whether the input "shuffle" is the same with iota indices.
      // Return "src" if true, otherwise swap the two elements of "src".
      cm(EQ, dst, size2, shuffle, tmp);
      ext(tmp, size1, src, src, 8);
      bsl(dst, size1, src, tmp);
      break;
    default:
      assert(false, "unsupported element type");
      ShouldNotReachHere();
  }
}
2455
2456 // Extract a scalar element from an sve vector at position 'idx'.
2457 // The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    // The element lives in the low 128 bits, which NEON umov/smov can
    // address directly. Sub-word elements are sign-extended (smov) since
    // byte/short results are signed in Java; int/long use umov.
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    // Element is beyond the NEON-addressable range: copy src and shift the
    // requested element down to lane 0 with a byte-offset "ext"
    // (idx << size scales the index to a byte offset), then move lane 0.
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}
2478
2479 // java.lang.Math::round intrinsics
2480
2481 // Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  // Two candidate roundings are computed per lane and one is selected:
  // floor(src + 0.5) for the common case, and fcvtas (round, ties away)
  // for lanes that are large, NaN, or otherwise outside the safe range of
  // the add-then-floor sequence. The threshold constant (2^23 for float,
  // 2^52 for double) is the magnitude beyond which the values are compared.
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  // bif inserts tmp1 bits into dst where the tmp3 flag bits are clear,
  // i.e. lanes failing the comparison take the floor(src + 0.5) result.
  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
2513
2514 // Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  // Threshold constant (2^23 for float, 2^52 for double) against which
  // the negated source is compared below.
  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  // Build a predicate of the lanes that need the floor(src + 0.5)
  // fix-up instead of the frinta result.
  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  // Skip the fix-up entirely when the compare left no active lanes.
  br(EQ, none);
  {
    // For the selected lanes only: dst = floor(src + 0.5).
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  // Convert the rounded floating-point values to integers.
  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
2551
void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  // Per-lane mask: all-ones where |src| > 0.0, zero otherwise.
  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  // After the shift the mask covers all bits except the sign bit, so bsl
  // combines the magnitude of "one" with the sign bit of src (giving +-1.0)
  // where the mask is set, and passes src through (+-0.0, NaN) elsewhere.
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
2561
void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  // Work on a copy of src so the source vector is preserved.
  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      // Same as above, with the 64-bit sign-bit mask and +1.0d pattern.
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}
2586
2587 bool C2_MacroAssembler::in_scratch_emit_size() {
2588 if (ciEnv::current()->task() != nullptr) {
2589 PhaseOutput* phase_output = Compile::current()->output();
2590 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2591 return true;
2592 }
2593 }
2594 return MacroAssembler::in_scratch_emit_size();
2595 }
2596
// Fatal-error reporting target invoked from the code emitted by
// verify_int_in_range() when a CastII value falls outside its declared
// type range. Does not return.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
2600
// Emit a runtime check that the 32-bit value in rval lies within the range
// of the given type [t->_lo, t->_hi]; on failure, call
// abort_verify_int_in_range (fatal). Clobbers rtmp and the condition flags.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeInt::INT) {
    // The full int range: nothing can be out of bounds.
    return;
  }

  BLOCK_COMMENT("verify_int_in_range {");
  Label L_success, L_failure;

  jint lo = t->_lo;
  jint hi = t->_hi;

  // Only emit a bound check when that bound can actually be violated.
  if (lo != min_jint) {
    subsw(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jint) {
    subsw(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Out of range: pass (idx, val, lo, hi) to the fatal reporting routine.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  movw(c_rarg2, lo);
  movw(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_int_in_range");
}
2635
// Fatal-error reporting target invoked from the code emitted by
// verify_long_in_range() when a CastLL value falls outside its declared
// type range. Does not return.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
2639
// Emit a runtime check that the 64-bit value in rval lies within the range
// of the given type [t->_lo, t->_hi]; on failure, call
// abort_verify_long_in_range (fatal). Clobbers rtmp and the condition flags.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
  assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
  if (t == TypeLong::LONG) {
    // The full long range: nothing can be out of bounds.
    return;
  }

  BLOCK_COMMENT("verify_long_in_range {");
  Label L_success, L_failure;

  jlong lo = t->_lo;
  jlong hi = t->_hi;

  // Only emit a bound check when that bound can actually be violated.
  if (lo != min_jlong) {
    subs(rtmp, rval, lo);
    br(Assembler::LT, L_failure);
  }
  if (hi != max_jlong) {
    subs(rtmp, rval, hi);
    br(Assembler::GT, L_failure);
  }
  b(L_success);

  bind(L_failure);
  // Out of range: pass (idx, val, lo, hi) to the fatal reporting routine.
  movw(c_rarg0, idx);
  mov(c_rarg1, rval);
  mov(c_rarg2, lo);
  mov(c_rarg3, hi);
  reconstruct_frame_pointer(rtmp);
  rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
  hlt(0);

  bind(L_success);
  BLOCK_COMMENT("} verify_long_in_range");
}
2674
// Make sure rfp holds a valid frame pointer before an out-of-line runtime
// call. With PreserveFramePointer, rfp is already maintained (verified in
// debug builds); otherwise rfp is recomputed from sp and the fixed frame
// size. Clobbers rtmp in debug builds.
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  const int framesize = Compile::current()->output()->frame_size_in_bytes();
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rfp.
    add(rtmp, sp, framesize - 2 * wordSize);
    Label L_success;
    cmp(rfp, rtmp);
    br(Assembler::EQ, L_success);
    stop("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    // The saved-rfp slot sits two words below the top of the fixed frame.
    add(rfp, sp, framesize - 2 * wordSize);
  }
}
2692
2693 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2694 // using Neon instructions and places it in the destination vector element corresponding to the
2695 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2696 // where NUM_ELEM is the number of BasicType elements per vector.
2697 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2698 // Otherwise, selects src2[idx – NUM_ELEM]
void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
                                                     FloatRegister src2, FloatRegister index,
                                                     FloatRegister tmp, unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, tmp);
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;

  if (vector_length_in_bytes == 16) {
    assert(UseSVE <= 1, "sve must be <= 1");
    assert(src1->successor() == src2, "Source registers must be ordered");
    // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
    tbl(dst, size, src1, 2, index);
  } else { // vector length == 8
    assert(UseSVE == 0, "must be Neon only");
    // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
    // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
    // instruction with one vector lookup
    // tmp = [src2 | src1], low 64 bits of each source in one 128-bit register.
    ins(tmp, D, src1, 0, 0);
    ins(tmp, D, src2, 1, 0);
    tbl(dst, size, tmp, 1, index);
  }
}
2720
2721 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2722 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2723 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2724 // where NUM_ELEM is the number of BasicType elements per vector.
2725 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2726 // Otherwise, selects src2[idx – NUM_ELEM]
void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
                                                    FloatRegister src2, FloatRegister index,
                                                    FloatRegister tmp, SIMD_RegVariant T,
                                                    unsigned vector_length_in_bytes) {
  assert_different_registers(dst, src1, src2, index, tmp);

  if (vector_length_in_bytes == 8) {
    // We need to fit both the source vectors (src1, src2) in a single vector register because the
    // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
    // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
    // instruction with one vector lookup
    assert(UseSVE >= 1, "sve must be >= 1");
    // tmp = [src2 | src1], low 64 bits of each source in one register.
    ins(tmp, D, src1, 0, 0);
    ins(tmp, D, src2, 1, 0);
    sve_tbl(dst, T, tmp, index);
  } else { // UseSVE == 2 and vector_length_in_bytes > 8
    // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
    // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
    // is not executed on machines where vector_length_in_bytes < MaxVectorSize
    // with the only exception of 8B vector length.
    assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
    assert(src1->successor() == src2, "Source registers must be ordered");
    sve_tbl(dst, T, src1, src2, index);
  }
}
2752
2753 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2754 FloatRegister src2, FloatRegister index,
2755 FloatRegister tmp, BasicType bt,
2756 unsigned vector_length_in_bytes) {
2757
2758 assert_different_registers(dst, src1, src2, index, tmp);
2759
2760 // The cases that can reach this method are -
2761 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2762 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2763 //
2764 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2765 // and UseSVE = 2 with vector_length_in_bytes >= 8
2766 //
2767 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2768 // UseSVE = 1 with vector_length_in_bytes = 16
2769
2770 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2771 SIMD_RegVariant T = elemType_to_regVariant(bt);
2772 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2773 return;
2774 }
2775
2776 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2777 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2778 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2779
2780 bool isQ = vector_length_in_bytes == 16;
2781
2782 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2783 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2784
2785 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2786 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2787 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2788 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2789 // the indices can range from [0, 8).
2790 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2791 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2792 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2793 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2794 // Add the multiplied result to the vector in tmp to obtain the byte level
2795 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2796 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2797
2798 if (bt == T_BYTE) {
2799 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2800 } else {
2801 int elem_size = (bt == T_SHORT) ? 2 : 4;
2802 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2803
2804 mov(tmp, size1, elem_size);
2805 mulv(dst, size2, index, tmp);
2806 mov(tmp, size2, tbl_offset);
2807 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2808 // to select a set of 2B/4B
2809 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2810 }
2811 }
2812
2813 // Vector expand implementation. Elements from the src vector are expanded into
2814 // the dst vector under the control of the vector mask.
2815 // Since there are no native instructions directly corresponding to expand before
2816 // SVE2p2, the following implementations mainly leverages the TBL instruction to
2817 // implement expand. To compute the index input for TBL, the prefix sum algorithm
2818 // (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
2819 // for NEON and SVE, but with different instructions where appropriate.
2820
2821 // Vector expand implementation for NEON.
2822 //
2823 // An example of 128-bit Byte vector:
2824 // Data direction: high <== low
2825 // Input:
2826 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2827 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2828 // Expected result:
2829 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
                                           FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                           int vector_length_in_bytes) {
  assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
  assert_different_registers(dst, src, mask, tmp1, tmp2);
  // Since the TBL instruction only supports byte table, we need to
  // compute indices in byte type for all types.
  SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
  // tmp1 is kept at all-zero for the whole sequence: it seeds the prefix-sum
  // shifts below and supplies the zero value selected for inactive lanes.
  // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  dup(tmp1, size, zr);
  // Negate the mask so each active lane (-1) becomes a count of 1.
  // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
  negr(dst, size, mask);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each iteration shifts the running sums up by i bytes (EXT with tmp1's
  // zeros filling the vacated low bytes) and accumulates, doubling the shift
  // every round - log2(vector_length_in_bytes) iterations in total. After the
  // loop each byte holds the inclusive count of active lanes at or below it.
  // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
  for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
    ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
    addv(dst, size, tmp2, dst);
  }
  // Copy the mask into tmp2 (ORR with itself), since BSL below overwrites
  // its first operand and "mask" must stay intact for the caller.
  // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
  orr(tmp2, size, mask, mask);
  // Bitwise select: keep the prefix sums in active lanes, zeros (tmp1)
  // in inactive lanes.
  // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
  bsl(tmp2, size, dst, tmp1);
  // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  movi(tmp1, size, 1);
  // Subtract 1 to turn the 1-based counts into 0-based source byte indices.
  // Inactive lanes become -1 (0xff), an out-of-range TBL index, which makes
  // TBL produce zero in those lanes (see the final result below).
  // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
  subv(dst, size, tmp2, tmp1);
  // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
  tbl(dst, size, src, 1, dst);
}
2859
2860 // Vector expand implementation for SVE.
2861 //
2862 // An example of 128-bit Short vector:
2863 // Data direction: high <== low
2864 // Input:
2865 // src = gf ed cb a9 87 65 43 21
2866 // pg = 00 01 00 01 00 01 00 01
2867 // Expected result:
2868 // dst = 00 87 00 65 00 43 00 21
void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
                                          FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
                                          int vector_length_in_bytes) {
  assert(UseSVE > 0, "expand implementation only for SVE");
  assert_different_registers(dst, src, tmp1, tmp2);
  SIMD_RegVariant size = elemType_to_regVariant(bt);

  // tmp1 is kept at all-zero for the whole sequence: it seeds the prefix-sum
  // shifts below and supplies the zero value selected for inactive lanes.
  // tmp1 = 00 00 00 00 00 00 00 00
  sve_dup(tmp1, size, 0);
  sve_movprfx(tmp2, tmp1);
  // Set each active lane (per pg) to 1, leaving inactive lanes zero
  // (merging cpy after the movprfx-zeroing above).
  // tmp2 = 00 01 00 01 00 01 00 01
  sve_cpy(tmp2, size, pg, 1, true);
  // Calculate vector index for TBL with prefix sum algorithm.
  // Each iteration shifts tmp2 up by i bytes and accumulates, doubling the
  // shift each round; the loop starts at the element size since indices are
  // computed per element here (unlike the byte-only NEON variant). The EXT
  // below is destructive, so dst is first zero-filled from tmp1 to provide
  // the zero bytes shifted in at the bottom.
  // tmp2 = 04 04 03 03 02 02 01 01
  for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
    sve_movprfx(dst, tmp1);
    // The EXT instruction operates on the full-width sve register. The correct
    // index calculation method is:
    // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
    // MaxVectorSize - i.
    sve_ext(dst, tmp2, MaxVectorSize - i);
    sve_add(tmp2, size, dst, tmp2);
  }
  // Select the prefix sums in active lanes and zeros (tmp1) elsewhere.
  // dst = 00 04 00 03 00 02 00 01
  sve_sel(dst, size, pg, tmp2, tmp1);
  // Subtract 1 to turn the 1-based counts into 0-based source element
  // indices; inactive lanes become -1, an out-of-range TBL index, which
  // yields zero in those lanes (see the final result below).
  // dst = -1 03 -1 02 -1 01 -1 00
  sve_sub(dst, size, 1);
  // dst = 00 87 00 65 00 43 00 21
  sve_tbl(dst, size, src, dst);
}
2899
2900 // Optimized SVE cpy (imm, zeroing) instruction.
2901 //
2902 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2903 // functionality, but test results show that `movi; cpy(imm, merging)` has
2904 // higher throughput on some microarchitectures. This would depend on
2905 // microarchitecture and so may vary between implementations.
2906 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2907 PRegister pg, int imm8, bool isMerge) {
2908 if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2909 // Generates a NEON instruction `movi V<dst>.2d, #0`.
2910 // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2911 // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2912 // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2913 // entire Z<dst> register. According to the Arm Software Optimization
2914 // Guide, `movi` is zero latency.
2915 movi(dst, T2D, 0);
2916 isMerge = true;
2917 }
2918 Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2919 }