1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright 2026 Arm Limited and/or its affiliates.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "opto/c2_MacroAssembler.hpp"
29 #include "opto/compile.hpp"
30 #include "opto/intrinsicnode.hpp"
31 #include "opto/matcher.hpp"
32 #include "opto/output.hpp"
33 #include "opto/subnode.hpp"
34 #include "runtime/objectMonitorTable.hpp"
35 #include "runtime/stubRoutines.hpp"
36 #include "runtime/synchronizer.hpp"
37 #include "utilities/globalDefinitions.hpp"
38 #include "utilities/powerOfTwo.hpp"
39
// Helper macros for annotating the generated code.
//
//   BLOCK_COMMENT(str)  expands to block_comment(str) in non-PRODUCT builds and to nothing in
//                       PRODUCT builds.
//   STOP(error)         calls stop(error); non-PRODUCT builds first pass the same text to
//                       block_comment().
//   BIND(label)         calls bind(label) and then BLOCK_COMMENT("<label>:"), where <label> is the
//                       spelling of the macro argument.
//
// The macros that expand to more than one call are wrapped in do { ... } while (0) so that each
// use is exactly one statement. Without the wrapper, `if (cond) STOP(msg);` would guard only the
// first call and execute the second one unconditionally. Uses must be followed by a semicolon.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) do { block_comment(error); stop(error); } while (0)
#endif

#define BIND(label) do { bind(label); BLOCK_COMMENT(#label ":"); } while (0)
49
50 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
51
52 void C2_MacroAssembler::entry_barrier() {
53 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
54 // Dummy labels for just measuring the code size
55 Label dummy_slow_path;
56 Label dummy_continuation;
57 Label dummy_guard;
58 Label* slow_path = &dummy_slow_path;
59 Label* continuation = &dummy_continuation;
60 Label* guard = &dummy_guard;
61 if (!Compile::current()->output()->in_scratch_emit_size()) {
62 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
63 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
64 Compile::current()->output()->add_stub(stub);
65 slow_path = &stub->entry();
66 continuation = &stub->continuation();
67 guard = &stub->guard();
68 }
69 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
70 bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
71 }
72
73 // jdk.internal.util.ArraysSupport.vectorizedHashCode
74 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
75 FloatRegister vdata0, FloatRegister vdata1,
76 FloatRegister vdata2, FloatRegister vdata3,
77 FloatRegister vmul0, FloatRegister vmul1,
78 FloatRegister vmul2, FloatRegister vmul3,
79 FloatRegister vpow, FloatRegister vpowm,
80 BasicType eltype) {
81 ARRAYS_HASHCODE_REGISTERS;
82
83 Register tmp1 = rscratch1, tmp2 = rscratch2;
84
85 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
86
87 // Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
88 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
89 // use 4H for chars and shorts instead, but using 8H gives better performance.
90 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
91 : eltype == T_CHAR || eltype == T_SHORT ? 8
92 : eltype == T_INT ? 4
93 : 0;
94 guarantee(vf, "unsupported eltype");
95
96 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
97 const size_t unroll_factor = 4;
98
99 switch (eltype) {
100 case T_BOOLEAN:
101 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
102 break;
103 case T_CHAR:
104 BLOCK_COMMENT("arrays_hashcode(char) {");
105 break;
106 case T_BYTE:
107 BLOCK_COMMENT("arrays_hashcode(byte) {");
108 break;
109 case T_SHORT:
110 BLOCK_COMMENT("arrays_hashcode(short) {");
111 break;
112 case T_INT:
113 BLOCK_COMMENT("arrays_hashcode(int) {");
114 break;
115 default:
116 ShouldNotReachHere();
117 }
118
119 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
120 // implemented by the stub executes just once. Call the stub only if at least two iterations will
121 // be executed.
122 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
123 cmpw(cnt, large_threshold);
124 br(Assembler::HS, LARGE);
125
126 bind(TAIL);
127
128 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
129 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
130 // Iteration eats up the remainder, uf elements at a time.
131 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
132 andr(tmp2, cnt, unroll_factor - 1);
133 adr(tmp1, BR_BASE);
134 // For Cortex-A53 offset is 4 because 2 nops are generated.
135 sub(tmp1, tmp1, tmp2, ext::sxtw, VM_Version::supports_a53mac() ? 4 : 3);
136 movw(tmp2, 0x1f);
137 br(tmp1);
138
139 bind(LOOP);
140 for (size_t i = 0; i < unroll_factor; ++i) {
141 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
142 maddw(result, result, tmp2, tmp1);
143 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
144 // Generate 2nd nop to have 4 instructions per iteration.
145 if (VM_Version::supports_a53mac()) {
146 nop();
147 }
148 }
149 bind(BR_BASE);
150 subsw(cnt, cnt, unroll_factor);
151 br(Assembler::HS, LOOP);
152
153 b(DONE);
154
155 bind(LARGE);
156
157 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
158 assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
159 address tpc = trampoline_call(stub);
160 if (tpc == nullptr) {
161 DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
162 postcond(pc() == badAddress);
163 return nullptr;
164 }
165
166 bind(DONE);
167
168 BLOCK_COMMENT("} // arrays_hashcode");
169
170 postcond(pc() != badAddress);
171 return pc();
172 }
173
174 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register t1,
175 Register t2, Register t3) {
176 assert_different_registers(obj, box, t1, t2, t3, rscratch2);
177
178 // Handle inflated monitor.
179 Label inflated;
180 // Finish fast lock successfully. MUST branch to with flag == EQ
181 Label locked;
182 // Finish fast lock unsuccessfully. MUST branch to with flag == NE
183 Label slow_path;
184
185 if (UseObjectMonitorTable) {
186 // Clear cache in case fast locking succeeds or we need to take the slow-path.
187 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
188 }
189
190 if (DiagnoseSyncOnValueBasedClasses != 0) {
191 load_klass(t1, obj);
192 ldrb(t1, Address(t1, Klass::misc_flags_offset()));
193 tst(t1, KlassFlags::_misc_is_value_based_class);
194 br(Assembler::NE, slow_path);
195 }
196
197 const Register t1_mark = t1;
198 const Register t3_t = t3;
199
200 { // Fast locking
201
202 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
203 Label push;
204
205 const Register t2_top = t2;
206
207 // Check if lock-stack is full.
208 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
209 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
210 br(Assembler::GT, slow_path);
211
212 // Check if recursive.
213 subw(t3_t, t2_top, oopSize);
214 ldr(t3_t, Address(rthread, t3_t));
215 cmp(obj, t3_t);
216 br(Assembler::EQ, push);
217
218 // Relaxed normal load to check for monitor. Optimization for monitor case.
219 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
220 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
221
222 // Not inflated
223 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
224
225 // Try to lock. Transition lock-bits 0b01 => 0b00
226 orr(t1_mark, t1_mark, markWord::unlocked_value);
227 eor(t3_t, t1_mark, markWord::unlocked_value);
228 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, memory_order_acquire);
229 br(Assembler::NE, slow_path);
230
231 bind(push);
232 // After successful lock, push object on lock-stack.
233 str(obj, Address(rthread, t2_top));
234 addw(t2_top, t2_top, oopSize);
235 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
236 b(locked);
237 }
238
239 { // Handle inflated monitor.
240 bind(inflated);
241
242 const Register t1_monitor = t1;
243
244 if (!UseObjectMonitorTable) {
245 assert(t1_monitor == t1_mark, "should be the same here");
246 } else {
247 const Register t1_hash = t1;
248 Label monitor_found;
249
250 // Save the mark, we might need it to extract the hash.
251 mov(t3, t1_mark);
252
253 // Look for the monitor in the om_cache.
254
255 ByteSize cache_offset = JavaThread::om_cache_oops_offset();
256 ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
257 const int num_unrolled = OMCache::CAPACITY;
258 for (int i = 0; i < num_unrolled; i++) {
259 ldr(t1_monitor, Address(rthread, cache_offset + monitor_offset));
260 ldr(t2, Address(rthread, cache_offset));
261 cmp(obj, t2);
262 br(Assembler::EQ, monitor_found);
263 cache_offset = cache_offset + OMCache::oop_to_oop_difference();
264 }
265
266 // Look for the monitor in the table.
267
268 // Get the hash code.
269 ubfx(t1_hash, t3, markWord::hash_shift, markWord::hash_bits);
270
271 // Get the table and calculate the bucket's address
272 lea(t3, ExternalAddress(ObjectMonitorTable::current_table_address()));
273 ldr(t3, Address(t3));
274 ldr(t2, Address(t3, ObjectMonitorTable::table_capacity_mask_offset()));
275 ands(t1_hash, t1_hash, t2);
276 ldr(t3, Address(t3, ObjectMonitorTable::table_buckets_offset()));
277
278 // Read the monitor from the bucket.
279 ldr(t1_monitor, Address(t3, t1_hash, Address::lsl(LogBytesPerWord)));
280
281 // Check if the monitor in the bucket is special (empty, tombstone or removed).
282 cmp(t1_monitor, (unsigned char)ObjectMonitorTable::SpecialPointerValues::below_is_special);
283 br(Assembler::LO, slow_path);
284
285 // Check if object matches.
286 ldr(t3, Address(t1_monitor, ObjectMonitor::object_offset()));
287 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
288 bs_asm->try_peek_weak_handle_in_nmethod(this, t3, t3, t2, slow_path);
289 cmp(t3, obj);
290 br(Assembler::NE, slow_path);
291
292 bind(monitor_found);
293 }
294
295 const Register t2_owner_addr = t2;
296 const Register t3_owner = t3;
297 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
298 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
299 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
300
301 Label monitor_locked;
302
303 // Compute owner address.
304 lea(t2_owner_addr, owner_address);
305
306 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
307 ldr(rscratch2, Address(rthread, JavaThread::monitor_owner_id_offset()));
308 cmpxchg(t2_owner_addr, zr, rscratch2, Assembler::xword, memory_order_acquire, t3_owner);
309 br(Assembler::EQ, monitor_locked);
310
311 // Check if recursive.
312 cmp(t3_owner, rscratch2);
313 br(Assembler::NE, slow_path);
314
315 // Recursive.
316 increment(recursions_address, 1);
317
318 bind(monitor_locked);
319 if (UseObjectMonitorTable) {
320 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
321 }
322 }
323
324 bind(locked);
325
326 #ifdef ASSERT
327 // Check that locked label is reached with Flags == EQ.
328 Label flag_correct;
329 br(Assembler::EQ, flag_correct);
330 stop("Fast Lock Flag != EQ");
331 #endif
332
333 bind(slow_path);
334 #ifdef ASSERT
335 // Check that slow_path label is reached with Flags == NE.
336 br(Assembler::NE, flag_correct);
337 stop("Fast Lock Flag != NE");
338 bind(flag_correct);
339 #endif
340 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
341 }
342
343 void C2_MacroAssembler::fast_unlock(Register obj, Register box, Register t1,
344 Register t2, Register t3) {
345 assert_different_registers(obj, box, t1, t2, t3);
346
347 // Handle inflated monitor.
348 Label inflated, inflated_load_mark;
349 // Finish fast unlock successfully. MUST branch to with flag == EQ
350 Label unlocked;
351 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
352 Label slow_path;
353
354 const Register t1_mark = t1;
355 const Register t2_top = t2;
356 const Register t3_t = t3;
357
358 { // Fast unlock
359
360 Label push_and_slow_path;
361
362 // Check if obj is top of lock-stack.
363 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
364 subw(t2_top, t2_top, oopSize);
365 ldr(t3_t, Address(rthread, t2_top));
366 cmp(obj, t3_t);
367 // Top of lock stack was not obj. Must be monitor.
368 br(Assembler::NE, inflated_load_mark);
369
370 // Pop lock-stack.
371 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
372 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
373
374 // Check if recursive.
375 subw(t3_t, t2_top, oopSize);
376 ldr(t3_t, Address(rthread, t3_t));
377 cmp(obj, t3_t);
378 br(Assembler::EQ, unlocked);
379
380 // Not recursive.
381 // Load Mark.
382 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
383
384 // Check header for monitor (0b10).
385 // Because we got here by popping (meaning we pushed in locked)
386 // there will be no monitor in the box. So we need to push back the obj
387 // so that the runtime can fix any potential anonymous owner.
388 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
389
390 // Try to unlock. Transition lock bits 0b00 => 0b01
391 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
392 orr(t3_t, t1_mark, markWord::unlocked_value);
393 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, memory_order_release);
394 br(Assembler::EQ, unlocked);
395
396 bind(push_and_slow_path);
397 // Compare and exchange failed.
398 // Restore lock-stack and handle the unlock in runtime.
399 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
400 addw(t2_top, t2_top, oopSize);
401 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
402 b(slow_path);
403 }
404
405
406 { // Handle inflated monitor.
407 bind(inflated_load_mark);
408 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
409 #ifdef ASSERT
410 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
411 stop("Fast Unlock not monitor");
412 #endif
413
414 bind(inflated);
415
416 #ifdef ASSERT
417 Label check_done;
418 subw(t2_top, t2_top, oopSize);
419 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
420 br(Assembler::LT, check_done);
421 ldr(t3_t, Address(rthread, t2_top));
422 cmp(obj, t3_t);
423 br(Assembler::NE, inflated);
424 stop("Fast Unlock lock on stack");
425 bind(check_done);
426 #endif
427
428 const Register t1_monitor = t1;
429
430 if (!UseObjectMonitorTable) {
431 assert(t1_monitor == t1_mark, "should be the same here");
432
433 // Untag the monitor.
434 add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
435 } else {
436 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
437 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
438 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
439 br(Assembler::LO, slow_path);
440 }
441
442 const Register t2_recursions = t2;
443 Label not_recursive;
444
445 // Check if recursive.
446 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
447 cbz(t2_recursions, not_recursive);
448
449 // Recursive unlock.
450 sub(t2_recursions, t2_recursions, 1u);
451 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
452 // Set flag == EQ
453 cmp(t2_recursions, t2_recursions);
454 b(unlocked);
455
456 bind(not_recursive);
457
458 const Register t2_owner_addr = t2;
459
460 // Compute owner address.
461 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
462
463 // Set owner to null.
464 // Release to satisfy the JMM
465 stlr(zr, t2_owner_addr);
466 // We need a full fence after clearing owner to avoid stranding.
467 // StoreLoad achieves this.
468 membar(StoreLoad);
469
470 // Check if the entry_list is empty.
471 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::entry_list_offset()));
472 cmp(rscratch1, zr);
473 br(Assembler::EQ, unlocked); // If so we are done.
474
475 // Check if there is a successor.
476 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
477 cmp(rscratch1, zr);
478 br(Assembler::NE, unlocked); // If so we are done.
479
480 // Save the monitor pointer in the current thread, so we can try to
481 // reacquire the lock in SharedRuntime::monitor_exit_helper().
482 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
483
484 cmp(zr, rthread); // Set Flag to NE => slow path
485 b(slow_path);
486 }
487
488 bind(unlocked);
489 cmp(zr, zr); // Set Flags to EQ => fast path
490
491 #ifdef ASSERT
492 // Check that unlocked label is reached with Flags == EQ.
493 Label flag_correct;
494 br(Assembler::EQ, flag_correct);
495 stop("Fast Unlock Flag != EQ");
496 #endif
497
498 bind(slow_path);
499 #ifdef ASSERT
500 // Check that slow_path label is reached with Flags == NE.
501 br(Assembler::NE, flag_correct);
502 stop("Fast Unlock Flag != NE");
503 bind(flag_correct);
504 #endif
505 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
506 }
507
508 // Search for str1 in str2 and return index or -1
509 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
510 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
511 Register cnt2, Register cnt1,
512 Register tmp1, Register tmp2,
513 Register tmp3, Register tmp4,
514 Register tmp5, Register tmp6,
515 int icnt1, Register result, int ae) {
516 // NOTE: tmp5, tmp6 can be zr depending on specific method version
517 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
518
519 Register ch1 = rscratch1;
520 Register ch2 = rscratch2;
521 Register cnt1tmp = tmp1;
522 Register cnt2tmp = tmp2;
523 Register cnt1_neg = cnt1;
524 Register cnt2_neg = cnt2;
525 Register result_tmp = tmp4;
526
527 bool isL = ae == StrIntrinsicNode::LL;
528
529 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
530 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
531 int str1_chr_shift = str1_isL ? 0:1;
532 int str2_chr_shift = str2_isL ? 0:1;
533 int str1_chr_size = str1_isL ? 1:2;
534 int str2_chr_size = str2_isL ? 1:2;
535 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
536 (chr_insn)&MacroAssembler::ldrh;
537 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
538 (chr_insn)&MacroAssembler::ldrh;
539 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
540 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
541
542 // Note, inline_string_indexOf() generates checks:
543 // if (substr.count > string.count) return -1;
544 // if (substr.count == 0) return 0;
545
546 // We have two strings, a source string in str2, cnt2 and a pattern string
547 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
548
549 // For larger pattern and source we use a simplified Boyer Moore algorithm.
550 // With a small pattern and source we use linear scan.
551
552 if (icnt1 == -1) {
553 sub(result_tmp, cnt2, cnt1);
554 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
555 br(LT, LINEARSEARCH);
556 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
557 subs(zr, cnt1, 256);
558 lsr(tmp1, cnt2, 2);
559 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
560 br(GE, LINEARSTUB);
561 }
562
563 // The Boyer Moore alogorithm is based on the description here:-
564 //
565 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
566 //
567 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
568 // and the 'Good Suffix' rule.
569 //
570 // These rules are essentially heuristics for how far we can shift the
571 // pattern along the search string.
572 //
573 // The implementation here uses the 'Bad Character' rule only because of the
574 // complexity of initialisation for the 'Good Suffix' rule.
575 //
576 // This is also known as the Boyer-Moore-Horspool algorithm:-
577 //
578 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
579 //
580 // This particular implementation has few java-specific optimizations.
581 //
582 // #define ASIZE 256
583 //
584 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
585 // int i, j;
586 // unsigned c;
587 // unsigned char bc[ASIZE];
588 //
589 // /* Preprocessing */
590 // for (i = 0; i < ASIZE; ++i)
591 // bc[i] = m;
592 // for (i = 0; i < m - 1; ) {
593 // c = x[i];
594 // ++i;
595 // // c < 256 for Latin1 string, so, no need for branch
596 // #ifdef PATTERN_STRING_IS_LATIN1
597 // bc[c] = m - i;
598 // #else
599 // if (c < ASIZE) bc[c] = m - i;
600 // #endif
601 // }
602 //
603 // /* Searching */
604 // j = 0;
605 // while (j <= n - m) {
606 // c = y[i+j];
607 // if (x[m-1] == c)
608 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
609 // if (i < 0) return j;
610 // // c < 256 for Latin1 string, so, no need for branch
611 // #ifdef SOURCE_STRING_IS_LATIN1
612 // // LL case: (c< 256) always true. Remove branch
613 // j += bc[y[j+m-1]];
614 // #endif
615 // #ifndef PATTERN_STRING_IS_UTF
616 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
617 // if (c < ASIZE)
618 // j += bc[y[j+m-1]];
619 // else
620 // j += 1
621 // #endif
622 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
623 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
624 // if (c < ASIZE)
625 // j += bc[y[j+m-1]];
626 // else
627 // j += m
628 // #endif
629 // }
630 // }
631
632 if (icnt1 == -1) {
633 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
634 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
635 Register cnt1end = tmp2;
636 Register str2end = cnt2;
637 Register skipch = tmp2;
638
639 // str1 length is >=8, so, we can read at least 1 register for cases when
640 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
641 // UL case. We'll re-read last character in inner pre-loop code to have
642 // single outer pre-loop load
643 const int firstStep = isL ? 7 : 3;
644
645 const int ASIZE = 256;
646 const int STORED_BYTES = 32; // amount of bytes stored per instruction
647 sub(sp, sp, ASIZE);
648 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
649 mov(ch1, sp);
650 BIND(BM_INIT_LOOP);
651 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
652 subs(tmp5, tmp5, 1);
653 br(GT, BM_INIT_LOOP);
654
655 sub(cnt1tmp, cnt1, 1);
656 mov(tmp5, str2);
657 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
658 sub(ch2, cnt1, 1);
659 mov(tmp3, str1);
660 BIND(BCLOOP);
661 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
662 if (!str1_isL) {
663 subs(zr, ch1, ASIZE);
664 br(HS, BCSKIP);
665 }
666 strb(ch2, Address(sp, ch1));
667 BIND(BCSKIP);
668 subs(ch2, ch2, 1);
669 br(GT, BCLOOP);
670
671 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
672 if (str1_isL == str2_isL) {
673 // load last 8 bytes (8LL/4UU symbols)
674 ldr(tmp6, Address(tmp6, -wordSize));
675 } else {
676 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
677 // convert Latin1 to UTF. We'll have to wait until load completed, but
678 // it's still faster than per-character loads+checks
679 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
680 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
681 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
682 andr(tmp6, tmp6, 0xFF); // str1[N-4]
683 orr(ch2, ch1, ch2, LSL, 16);
684 orr(tmp6, tmp6, tmp3, LSL, 48);
685 orr(tmp6, tmp6, ch2, LSL, 16);
686 }
687 BIND(BMLOOPSTR2);
688 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
689 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
690 if (str1_isL == str2_isL) {
691 // re-init tmp3. It's for free because it's executed in parallel with
692 // load above. Alternative is to initialize it before loop, but it'll
693 // affect performance on in-order systems with 2 or more ld/st pipelines
694 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
695 }
696 if (!isL) { // UU/UL case
697 lsl(ch2, cnt1tmp, 1); // offset in bytes
698 }
699 cmp(tmp3, skipch);
700 br(NE, BMSKIP);
701 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
702 mov(ch1, tmp6);
703 if (isL) {
704 b(BMLOOPSTR1_AFTER_LOAD);
705 } else {
706 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
707 b(BMLOOPSTR1_CMP);
708 }
709 BIND(BMLOOPSTR1);
710 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
711 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
712 BIND(BMLOOPSTR1_AFTER_LOAD);
713 subs(cnt1tmp, cnt1tmp, 1);
714 br(LT, BMLOOPSTR1_LASTCMP);
715 BIND(BMLOOPSTR1_CMP);
716 cmp(ch1, ch2);
717 br(EQ, BMLOOPSTR1);
718 BIND(BMSKIP);
719 if (!isL) {
720 // if we've met UTF symbol while searching Latin1 pattern, then we can
721 // skip cnt1 symbols
722 if (str1_isL != str2_isL) {
723 mov(result_tmp, cnt1);
724 } else {
725 mov(result_tmp, 1);
726 }
727 subs(zr, skipch, ASIZE);
728 br(HS, BMADV);
729 }
730 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
731 BIND(BMADV);
732 sub(cnt1tmp, cnt1, 1);
733 add(str2, str2, result_tmp, LSL, str2_chr_shift);
734 cmp(str2, str2end);
735 br(LE, BMLOOPSTR2);
736 add(sp, sp, ASIZE);
737 b(NOMATCH);
738 BIND(BMLOOPSTR1_LASTCMP);
739 cmp(ch1, ch2);
740 br(NE, BMSKIP);
741 BIND(BMMATCH);
742 sub(result, str2, tmp5);
743 if (!str2_isL) lsr(result, result, 1);
744 add(sp, sp, ASIZE);
745 b(DONE);
746
747 BIND(LINEARSTUB);
748 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
749 br(LT, LINEAR_MEDIUM);
750 mov(result, zr);
751 RuntimeAddress stub = nullptr;
752 if (isL) {
753 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
754 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
755 } else if (str1_isL) {
756 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
757 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
758 } else {
759 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
760 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
761 }
762 address call = trampoline_call(stub);
763 if (call == nullptr) {
764 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
765 ciEnv::current()->record_failure("CodeCache is full");
766 return;
767 }
768 b(DONE);
769 }
770
771 BIND(LINEARSEARCH);
772 {
773 Label DO1, DO2, DO3;
774
775 Register str2tmp = tmp2;
776 Register first = tmp3;
777
778 if (icnt1 == -1)
779 {
780 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
781
782 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
783 br(LT, DOSHORT);
784 BIND(LINEAR_MEDIUM);
785 (this->*str1_load_1chr)(first, Address(str1));
786 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
787 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
788 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
789 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
790
791 BIND(FIRST_LOOP);
792 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
793 cmp(first, ch2);
794 br(EQ, STR1_LOOP);
795 BIND(STR2_NEXT);
796 adds(cnt2_neg, cnt2_neg, str2_chr_size);
797 br(LE, FIRST_LOOP);
798 b(NOMATCH);
799
800 BIND(STR1_LOOP);
801 adds(cnt1tmp, cnt1_neg, str1_chr_size);
802 add(cnt2tmp, cnt2_neg, str2_chr_size);
803 br(GE, MATCH);
804
805 BIND(STR1_NEXT);
806 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
807 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
808 cmp(ch1, ch2);
809 br(NE, STR2_NEXT);
810 adds(cnt1tmp, cnt1tmp, str1_chr_size);
811 add(cnt2tmp, cnt2tmp, str2_chr_size);
812 br(LT, STR1_NEXT);
813 b(MATCH);
814
815 BIND(DOSHORT);
816 if (str1_isL == str2_isL) {
817 cmp(cnt1, (u1)2);
818 br(LT, DO1);
819 br(GT, DO3);
820 }
821 }
822
823 if (icnt1 == 4) {
824 Label CH1_LOOP;
825
826 (this->*load_4chr)(ch1, str1);
827 sub(result_tmp, cnt2, 4);
828 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
829 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
830
831 BIND(CH1_LOOP);
832 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
833 cmp(ch1, ch2);
834 br(EQ, MATCH);
835 adds(cnt2_neg, cnt2_neg, str2_chr_size);
836 br(LE, CH1_LOOP);
837 b(NOMATCH);
838 }
839
840 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
841 Label CH1_LOOP;
842
843 BIND(DO2);
844 (this->*load_2chr)(ch1, str1);
845 if (icnt1 == 2) {
846 sub(result_tmp, cnt2, 2);
847 }
848 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
849 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
850 BIND(CH1_LOOP);
851 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
852 cmp(ch1, ch2);
853 br(EQ, MATCH);
854 adds(cnt2_neg, cnt2_neg, str2_chr_size);
855 br(LE, CH1_LOOP);
856 b(NOMATCH);
857 }
858
859 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
860 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
861
862 BIND(DO3);
863 (this->*load_2chr)(first, str1);
864 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
865 if (icnt1 == 3) {
866 sub(result_tmp, cnt2, 3);
867 }
868 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
869 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
870 BIND(FIRST_LOOP);
871 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
872 cmpw(first, ch2);
873 br(EQ, STR1_LOOP);
874 BIND(STR2_NEXT);
875 adds(cnt2_neg, cnt2_neg, str2_chr_size);
876 br(LE, FIRST_LOOP);
877 b(NOMATCH);
878
879 BIND(STR1_LOOP);
880 add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
881 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
882 cmp(ch1, ch2);
883 br(NE, STR2_NEXT);
884 b(MATCH);
885 }
886
887 if (icnt1 == -1 || icnt1 == 1) {
888 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
889
890 BIND(DO1);
891 (this->*str1_load_1chr)(ch1, str1);
892 cmp(cnt2, (u1)8);
893 br(LT, DO1_SHORT);
894
895 sub(result_tmp, cnt2, 8/str2_chr_size);
896 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
897 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
898 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
899
900 if (str2_isL) {
901 orr(ch1, ch1, ch1, LSL, 8);
902 }
903 orr(ch1, ch1, ch1, LSL, 16);
904 orr(ch1, ch1, ch1, LSL, 32);
905 BIND(CH1_LOOP);
906 ldr(ch2, Address(str2, cnt2_neg));
907 eor(ch2, ch1, ch2);
908 sub(tmp1, ch2, tmp3);
909 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
910 bics(tmp1, tmp1, tmp2);
911 br(NE, HAS_ZERO);
912 adds(cnt2_neg, cnt2_neg, 8);
913 br(LT, CH1_LOOP);
914
915 cmp(cnt2_neg, (u1)8);
916 mov(cnt2_neg, 0);
917 br(LT, CH1_LOOP);
918 b(NOMATCH);
919
920 BIND(HAS_ZERO);
921 rev(tmp1, tmp1);
922 clz(tmp1, tmp1);
923 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
924 b(MATCH);
925
926 BIND(DO1_SHORT);
927 mov(result_tmp, cnt2);
928 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
929 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
930 BIND(DO1_LOOP);
931 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
932 cmpw(ch1, ch2);
933 br(EQ, MATCH);
934 adds(cnt2_neg, cnt2_neg, str2_chr_size);
935 br(LT, DO1_LOOP);
936 }
937 }
938 BIND(NOMATCH);
939 mov(result, -1);
940 b(DONE);
941 BIND(MATCH);
942 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
943 BIND(DONE);
944 }
945
// Pointer-to-member types used to select a MacroAssembler instruction emitter
// while generating code:
//   chr_insn - a load taking (Rt, adr); string_compare binds it to ldrb or ldrh.
//   uxt_insn - a register-to-register extend taking (Rd, Rn); string_compare
//              binds it to uxtbw or uxthw.
// chr_insn repeats the identical typedef that appears near the top of this file.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
948
// Emits code that searches the string of 16-bit chars at str1 (cnt1 chars) for
// the char held in ch. On exit, result holds the index (in chars) of a matching
// char found by an ascending scan, or -1 if there is none (including cnt1 == 0).
//
// Strings of at least 4 chars are scanned 8 bytes (4 chars) per iteration;
// shorter strings are scanned one char at a time.
//
// Clobbers: str1, cnt1 (reused as the negative byte offset cnt1_neg), ch (on
//           the 8-byte path), tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;          // same register as cnt1: byte offset, counts up towards zero
  Register ch1 = rscratch1;          // data loaded from the string
  Register result_tmp = rscratch2;   // char index that corresponds to offset zero

  // Empty string: nothing can match.
  cbz(cnt1, NOMATCH);

  // Fewer than 4 chars do not fill an 8-byte word; use the char-by-char loop.
  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate the 16-bit char into all four halfword lanes of ch.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the last full 8-byte word (char index cnt1 - 4) and address
  // the string through a negative byte offset from there.
  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // A halfword lane of ch1 becomes zero exactly where the string char equals ch.
    eor(ch1, ch, ch1);
    // tmp1 = (ch1 - 0x0001...) & ~(ch1 | 0x7fff...): non-zero iff some lane of
    // ch1 is zero; bics also sets the flags for the branch below.
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // The offset is now >= 0. If it is below 8, the word at offset 0 (the last
    // 8 bytes of the string) has not been examined yet: examine it once, even
    // though it may overlap chars already checked. The flags from cmp are
    // still live at the branch because the mov in between does not set them.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Byte-reverse so that clz counts from the lowest-addressed byte, then turn
    // the bit count into a byte offset and add it to the word's offset. The
    // flag bit sits in the upper byte of its halfword, so the offset is odd;
    // the arithmetic shift at MATCH discards that extra byte.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Short string: str1 points just past the end and the offset starts at
    // -cnt1 * 2, so offset zero corresponds to char index cnt1.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result = base char index + (byte offset / 2).
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
1011
// SVE variant of the single-char search. Emits code that scans the string at
// str1 (cnt1 chars) one SVE vector at a time for the char in ch and leaves the
// index of the match in result, or -1 if there is none (including cnt1 == 0).
//
// isL selects the element size: true  -> 8-bit chars (B lanes, ld1b),
//                               false -> 16-bit chars (H lanes, ld1h).
//
// ztmp1 receives the loaded string data and ztmp2 the broadcast char. tmp_pg is
// the governing predicate that limits each load to the remaining chars; tmp_pdn
// receives the per-lane compare result.
//
// Clobbers: ztmp1, ztmp2, tmp_pg, tmp_pdn, rscratch1, rscratch2, rflags
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;   // number of chars per vector
  Register idx = rscratch2;       // char index of the next vector to read

  SIMD_RegVariant T = (isL == true) ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    // Predicate for the next vector; the branch below consumes the flags this
    // instruction sets.
    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    // result = idx - 1 + (number of lanes kept by the break-after operation).
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}
1080
// Emits code that searches the string of 8-bit chars at str1 (cnt1 chars) for
// the char held in ch. On exit, result holds the index of a matching char found
// by an ascending scan, or -1 if there is none (including cnt1 == 0).
//
// Strings of at least 8 chars are scanned 8 bytes per iteration; shorter
// strings are scanned one byte at a time.
//
// Clobbers: str1, cnt1 (reused as the negative byte offset cnt1_neg), ch (on
//           the 8-byte path), tmp1, tmp2, tmp3, rscratch1, rscratch2, rflags
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                             Register ch, Register result,
                                             Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;          // same register as cnt1: byte offset, counts up towards zero
  Register ch1 = rscratch1;          // data loaded from the string
  Register result_tmp = rscratch2;   // char index that corresponds to offset zero

  // Empty string: nothing can match.
  cbz(cnt1, NOMATCH);

  // Fewer than 8 chars do not fill an 8-byte word; use the byte-by-byte loop.
  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  // Replicate the 8-bit char into all eight byte lanes of ch.
  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  // Point str1 at the last full 8-byte word (index cnt1 - 8) and address the
  // string through a negative byte offset from there.
  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // A byte lane of ch1 becomes zero exactly where the string char equals ch.
    eor(ch1, ch, ch1);
    // tmp1 = (ch1 - 0x01...) & ~(ch1 | 0x7f...): non-zero iff some byte of ch1
    // is zero; bics also sets the flags for the branch below.
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // The offset is now >= 0. If it is below 8, the word at offset 0 (the last
    // 8 bytes of the string) has not been examined yet: examine it once, even
    // though it may overlap bytes already checked. The flags from cmp are
    // still live at the branch because the mov in between does not set them.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Byte-reverse so that clz counts from the lowest-addressed byte, then turn
    // the bit count into a byte offset and add it to the word's offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Short string: str1 points just past the end and the offset starts at
    // -cnt1, so offset zero corresponds to index cnt1.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // result = base index + byte offset.
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}
1144
// Compare strings.
//
// Emits code that compares the string at str1 with the string at str2 and
// leaves the outcome in result:
//   - the difference of the first pair of chars that differ within the common
//     prefix (str1 char minus str2 char), or
//   - the difference of the two lengths in chars (cnt1 - cnt2) when the common
//     prefix is identical.
//
// ae is one of StrIntrinsicNode::LL/UU/LU/UL and gives the encoding of str1 and
// str2 respectively (L: one byte per char, U: two bytes per char).
// cnt1 and cnt2 are passed in bytes (see the note below) and are overwritten.
//
// Three strategies are emitted:
//   - common length <= minCharsInWord: a char-by-char loop (SHORT_STRING);
//   - common length >= stub_threshold: a call to the compare_long_string stub
//     selected by ae (STUB);
//   - otherwise: an inline loop that compares 8 bytes of widened data at a time.
//
// vtmp3, pgtmp1 and pgtmp2 are not referenced in this function body.
// NOTE(review): presumably they are reserved for the stub call - confirm.
//
// Clobbers: str1, str2, cnt1, cnt2, tmp1, tmp2, vtmp1, vtmp2, rscratch1,
//           rscratch2, rflags
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  // Number of chars that fit in one 8-byte word of (widened) data.
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  // vtmpZ is zeroed and interleaved with Latin-1 bytes to widen them to 16 bits.
  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      // Same address: the contents are equal, result keeps the length difference.
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their last full word and turn cnt2 into a
      // negative byte offset from there.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // str1 is Latin-1: load 4 bytes and widen them to 4 halfwords by
      // interleaving with the zero vector, so they compare against 8 bytes of str2.
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      // cnt1 becomes the negative byte offset into str1, cnt2 into str2.
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror image of the LU case: str2 is the Latin-1 string that is widened.
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    // Step past the word already loaded; a non-negative offset means only the
    // final word is left.
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    // Check the word loaded by the last iteration before loading the final one.
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    // rev + clz give the bit position of the lowest-addressed differing byte;
    // the mask rounds it down to a char boundary (8 or 16 bits).
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    // Long strings: hand over to the out-of-line stub for this encoding pair.
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
     }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      // The labels listed here are the ones that have not been bound yet at
      // this point; the compilation is abandoned.
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  // From here on cnt1 is reused as a temporary that holds a char of str2.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  // The loop alternates between two register pairs: (tmp1, cnt1) and
  // (tmp2, rscratch1). One pair is compared while the other is being loaded.
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
1380
1381 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1382 FloatRegister src2, Condition cond, bool isQ) {
1383 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1384 FloatRegister zn = src1, zm = src2;
1385 bool needs_negation = false;
1386 switch (cond) {
1387 case LT: cond = GT; zn = src2; zm = src1; break;
1388 case LE: cond = GE; zn = src2; zm = src1; break;
1389 case LO: cond = HI; zn = src2; zm = src1; break;
1390 case LS: cond = HS; zn = src2; zm = src1; break;
1391 case NE: cond = EQ; needs_negation = true; break;
1392 default:
1393 break;
1394 }
1395
1396 if (is_floating_point_type(bt)) {
1397 fcm(cond, dst, size, zn, zm);
1398 } else {
1399 cm(cond, dst, size, zn, zm);
1400 }
1401
1402 if (needs_negation) {
1403 notr(dst, isQ ? T16B : T8B, dst);
1404 }
1405 }
1406
1407 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1408 Condition cond, bool isQ) {
1409 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1410 if (bt == T_FLOAT || bt == T_DOUBLE) {
1411 if (cond == Assembler::NE) {
1412 fcm(Assembler::EQ, dst, size, src);
1413 notr(dst, isQ ? T16B : T8B, dst);
1414 } else {
1415 fcm(cond, dst, size, src);
1416 }
1417 } else {
1418 if (cond == Assembler::NE) {
1419 cm(Assembler::EQ, dst, size, src);
1420 notr(dst, isQ ? T16B : T8B, dst);
1421 } else {
1422 cm(cond, dst, size, src);
1423 }
1424 }
1425 }
1426
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
//
// On entry dst holds eight mask bytes; on exit its low 8 bits hold one bit per
// input byte (bit k taken from byte k) and all higher bits are zero. Each step
// ORs dst with a right-shifted copy of itself, which doubles the number of
// mask bits that sit next to each other.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}
1437
1438 // Pack the value of each mask element in "src" into a long value in "dst", at most
1439 // the first 64 lane elements. The input "src" is a vector of boolean represented as
1440 // bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into
1441 // one bit in "dst".
1442 //
1443 // Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16
1444 // Expected: dst = 0x658D
1445 //
1446 // Clobbers: rscratch1
1447 void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src,
1448 FloatRegister vtmp, int lane_cnt) {
1449 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1450 assert_different_registers(dst, rscratch1);
1451 assert_different_registers(src, vtmp);
1452 assert(UseSVE > 0, "must be");
1453
1454 // Compress the lowest 8 bytes.
1455 fmovd(dst, src);
1456 bytemask_compress(dst);
1457 if (lane_cnt <= 8) return;
1458
1459 // Repeat on higher bytes and join the results.
1460 // Compress 8 bytes in each iteration.
1461 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1462 sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp);
1463 bytemask_compress(rscratch1);
1464 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1465 }
1466 }
1467
1468 // The function is same as above "sve_vmask_tolong", but it uses SVE2's BEXT
1469 // instruction which requires the FEAT_BITPERM feature.
1470 void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
1471 FloatRegister vtmp1, FloatRegister vtmp2,
1472 int lane_cnt) {
1473 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1474 assert_different_registers(src, vtmp1, vtmp2);
1475 assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
1476
1477 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1478 // is to compress each significant bit of the byte in a cross-lane way. Due
1479 // to the lack of a cross-lane bit-compress instruction, we use BEXT
1480 // (bit-compress in each lane) with the biggest lane size (T = D) then
1481 // concatenate the results.
1482
1483 // The second source input of BEXT, initialized with 0x01 in each byte.
1484 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1485 sve_dup(vtmp2, B, 1);
1486
1487 // BEXT vtmp1.D, src.D, vtmp2.D
1488 // src = 0x0001010000010001 | 0x0100000001010001
1489 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1490 // ---------------------------------------
1491 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1492 sve_bext(vtmp1, D, src, vtmp2);
1493
1494 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
1495 // result to dst.
1496 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1497 // dst = 0x658D
1498 if (lane_cnt <= 8) {
1499 // No need to concatenate.
1500 umov(dst, vtmp1, B, 0);
1501 } else if (lane_cnt <= 16) {
1502 ins(vtmp1, B, vtmp1, 1, 8);
1503 umov(dst, vtmp1, H, 0);
1504 } else {
1505 // As the lane count is 64 at most, the final expected value must be in
1506 // the lowest 64 bits after narrowing vtmp1 from D to B.
1507 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1508 umov(dst, vtmp1, D, 0);
1509 }
1510 }
1511
1512 // Unpack the mask, a long value in "src", into a vector register of boolean
1513 // represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
1514 // "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
1515 // most 64 lanes.
1516 //
1517 // Below example gives the expected dst vector register, with a valid src(0x658D)
1518 // on a 128-bit vector size machine.
1519 // dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1520 void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
1521 FloatRegister vtmp, int lane_cnt) {
1522 assert_different_registers(dst, vtmp);
1523 assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1524 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1525
1526 // Example: src = 0x658D, lane_cnt = 16
1527 // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1528
1529 // Put long value from general purpose register into the first lane of vector.
1530 // vtmp = 0x0000000000000000 | 0x000000000000658D
1531 sve_dup(vtmp, B, 0);
1532 mov(vtmp, D, 0, src);
1533
1534 // Transform the value in the first lane which is mask in bit now to the mask in
1535 // byte, which can be done by SVE2's BDEP instruction.
1536
1537 // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
1538 // vtmp = 0x0000000000000065 | 0x000000000000008D
1539 if (lane_cnt <= 8) {
1540 // Nothing. As only one byte exsits.
1541 } else if (lane_cnt <= 16) {
1542 ins(vtmp, B, vtmp, 8, 1);
1543 } else {
1544 sve_vector_extend(vtmp, D, vtmp, B);
1545 }
1546
1547 // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1548 // dst = 0x01010101 0x01010101 0x01010101 0x01010101
1549 sve_dup(dst, B, 1);
1550
1551 // BDEP dst.D, vtmp.D, dst.D
1552 // vtmp = 0x0000000000000065 | 0x000000000000008D
1553 // dst = 0x0101010101010101 | 0x0101010101010101
1554 // ---------------------------------------
1555 // dst = 0x0001010000010001 | 0x0100000001010001
1556 sve_bdep(dst, D, vtmp, dst);
1557 }
1558
1559 // Clobbers: rflags
1560 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1561 FloatRegister zn, FloatRegister zm, Condition cond) {
1562 assert(pg->is_governing(), "This register has to be a governing predicate register");
1563 FloatRegister z1 = zn, z2 = zm;
1564 switch (cond) {
1565 case LE: z1 = zm; z2 = zn; cond = GE; break;
1566 case LT: z1 = zm; z2 = zn; cond = GT; break;
1567 case LO: z1 = zm; z2 = zn; cond = HI; break;
1568 case LS: z1 = zm; z2 = zn; cond = HS; break;
1569 default:
1570 break;
1571 }
1572
1573 SIMD_RegVariant size = elemType_to_regVariant(bt);
1574 if (is_floating_point_type(bt)) {
1575 sve_fcm(cond, pd, size, pg, z1, z2);
1576 } else {
1577 assert(is_integral_type(bt), "unsupported element type");
1578 sve_cmp(cond, pd, size, pg, z1, z2);
1579 }
1580 }
1581
// Get index of the last mask lane that is set
//
// The predicate is reversed so that the last set lane becomes the first one;
// sve_brkb and sve_cntp then produce a lane count from the reversed predicate
// (per the Arm ISA, BRKB is "break before" the first active true lane).
// dst = (MaxVectorSize / type2aelembytes(bt) - 1) - that count.
//
// Clobbers: ptmp, rscratch1
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  // rscratch1 = index of the highest lane for this element size.
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}
1591
1592 // Extend integer vector src to dst with the same lane count
1593 // but larger element size, e.g. 4B -> 4I
1594 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1595 FloatRegister src, BasicType src_bt, bool is_unsigned) {
1596 if (src_bt == T_BYTE) {
1597 // 4B to 4S/4I, 8B to 8S
1598 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1599 assert(dst_bt == T_SHORT || dst_bt == T_INT, "unsupported");
1600 _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1601 if (dst_bt == T_INT) {
1602 _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1603 }
1604 } else if (src_bt == T_SHORT) {
1605 // 2S to 2I/2L, 4S to 4I
1606 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1607 assert(dst_bt == T_INT || dst_bt == T_LONG, "unsupported");
1608 _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1609 if (dst_bt == T_LONG) {
1610 _xshll(is_unsigned, dst, T2D, dst, T2S, 0);
1611 }
1612 } else if (src_bt == T_INT) {
1613 // 2I to 2L
1614 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1615 _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1616 } else {
1617 ShouldNotReachHere();
1618 }
1619 }
1620
1621 // Narrow integer vector src down to dst with the same lane count
1622 // but smaller element size, e.g. 4I -> 4B
1623 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1624 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1625 if (src_bt == T_SHORT) {
1626 // 4S/8S to 4B/8B
1627 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1628 assert(dst_bt == T_BYTE, "unsupported");
1629 xtn(dst, T8B, src, T8H);
1630 } else if (src_bt == T_INT) {
1631 // 2I to 2S, 4I to 4B/4S
1632 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1633 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1634 xtn(dst, T4H, src, T4S);
1635 if (dst_bt == T_BYTE) {
1636 xtn(dst, T8B, dst, T8H);
1637 }
1638 } else if (src_bt == T_LONG) {
1639 // 2L to 2S/2I
1640 assert(src_vlen_in_bytes == 16, "unsupported");
1641 assert(dst_bt == T_INT || dst_bt == T_SHORT, "unsupported");
1642 xtn(dst, T2S, src, T2D);
1643 if (dst_bt == T_SHORT) {
1644 xtn(dst, T4H, dst, T4S);
1645 }
1646 } else {
1647 ShouldNotReachHere();
1648 }
1649 }
1650
1651 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1652 FloatRegister src, SIMD_RegVariant src_size,
1653 bool is_unsigned) {
1654 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1655
1656 if (src_size == B) {
1657 switch (dst_size) {
1658 case H:
1659 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1660 break;
1661 case S:
1662 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1663 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1664 break;
1665 case D:
1666 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1667 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1668 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1669 break;
1670 default:
1671 ShouldNotReachHere();
1672 }
1673 } else if (src_size == H) {
1674 if (dst_size == S) {
1675 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1676 } else { // D
1677 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1678 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1679 }
1680 } else if (src_size == S) {
1681 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1682 }
1683 }
1684
1685 // Vector narrow from src to dst with specified element sizes.
1686 // High part of dst vector will be filled with zero.
1687 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1688 FloatRegister src, SIMD_RegVariant src_size,
1689 FloatRegister tmp) {
1690 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1691 assert_different_registers(src, tmp);
1692 sve_dup(tmp, src_size, 0);
1693 if (src_size == D) {
1694 switch (dst_size) {
1695 case S:
1696 sve_uzp1(dst, S, src, tmp);
1697 break;
1698 case H:
1699 assert_different_registers(dst, tmp);
1700 sve_uzp1(dst, S, src, tmp);
1701 sve_uzp1(dst, H, dst, tmp);
1702 break;
1703 case B:
1704 assert_different_registers(dst, tmp);
1705 sve_uzp1(dst, S, src, tmp);
1706 sve_uzp1(dst, H, dst, tmp);
1707 sve_uzp1(dst, B, dst, tmp);
1708 break;
1709 default:
1710 ShouldNotReachHere();
1711 }
1712 } else if (src_size == S) {
1713 if (dst_size == H) {
1714 sve_uzp1(dst, H, src, tmp);
1715 } else { // B
1716 assert_different_registers(dst, tmp);
1717 sve_uzp1(dst, H, src, tmp);
1718 sve_uzp1(dst, B, dst, tmp);
1719 }
1720 } else if (src_size == H) {
1721 sve_uzp1(dst, B, src, tmp);
1722 }
1723 }
1724
1725 // Extend src predicate to dst predicate with the same lane count but larger
1726 // element size, e.g. 64Byte -> 512Long
1727 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1728 uint dst_element_length_in_bytes,
1729 uint src_element_length_in_bytes) {
1730 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1731 sve_punpklo(dst, src);
1732 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1733 sve_punpklo(dst, src);
1734 sve_punpklo(dst, dst);
1735 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1736 sve_punpklo(dst, src);
1737 sve_punpklo(dst, dst);
1738 sve_punpklo(dst, dst);
1739 } else {
1740 assert(false, "unsupported");
1741 ShouldNotReachHere();
1742 }
1743 }
1744
1745 // Narrow src predicate to dst predicate with the same lane count but
1746 // smaller element size, e.g. 512Long -> 64Byte
1747 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1748 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1749 // The insignificant bits in src predicate are expected to be zero.
1750 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1751 // passed as the second argument. An example narrowing operation with a given mask would be -
1752 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1753 // Mask (for 2 Longs) : TF
1754 // Predicate register for the above mask (16 bits) : 00000001 00000000
1755 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1756 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1757 assert_different_registers(src, ptmp);
1758 assert_different_registers(dst, ptmp);
1759 sve_pfalse(ptmp);
1760 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1761 sve_uzp1(dst, B, src, ptmp);
1762 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1763 sve_uzp1(dst, H, src, ptmp);
1764 sve_uzp1(dst, B, dst, ptmp);
1765 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1766 sve_uzp1(dst, S, src, ptmp);
1767 sve_uzp1(dst, H, dst, ptmp);
1768 sve_uzp1(dst, B, dst, ptmp);
1769 } else {
1770 assert(false, "unsupported");
1771 ShouldNotReachHere();
1772 }
1773 }
1774
1775 // Vector reduction add for integral type with ASIMD instructions.
1776 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1777 Register isrc, FloatRegister vsrc,
1778 unsigned vector_length_in_bytes,
1779 FloatRegister vtmp) {
1780 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1781 assert_different_registers(dst, isrc);
1782 bool isQ = vector_length_in_bytes == 16;
1783
1784 BLOCK_COMMENT("neon_reduce_add_integral {");
1785 switch(bt) {
1786 case T_BYTE:
1787 addv(vtmp, isQ ? T16B : T8B, vsrc);
1788 smov(dst, vtmp, B, 0);
1789 addw(dst, dst, isrc, ext::sxtb);
1790 break;
1791 case T_SHORT:
1792 addv(vtmp, isQ ? T8H : T4H, vsrc);
1793 smov(dst, vtmp, H, 0);
1794 addw(dst, dst, isrc, ext::sxth);
1795 break;
1796 case T_INT:
1797 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1798 umov(dst, vtmp, S, 0);
1799 addw(dst, dst, isrc);
1800 break;
1801 case T_LONG:
1802 assert(isQ, "unsupported");
1803 addpd(vtmp, vsrc);
1804 umov(dst, vtmp, D, 0);
1805 add(dst, dst, isrc);
1806 break;
1807 default:
1808 assert(false, "unsupported");
1809 ShouldNotReachHere();
1810 }
1811 BLOCK_COMMENT("} neon_reduce_add_integral");
1812 }
1813
1814 // Vector reduction multiply for integral type with ASIMD instructions.
1815 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1816 // Clobbers: rscratch1
1817 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1818 Register isrc, FloatRegister vsrc,
1819 unsigned vector_length_in_bytes,
1820 FloatRegister vtmp1, FloatRegister vtmp2) {
1821 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1822 bool isQ = vector_length_in_bytes == 16;
1823
1824 BLOCK_COMMENT("neon_reduce_mul_integral {");
1825 switch(bt) {
1826 case T_BYTE:
1827 if (isQ) {
1828 // Multiply the lower half and higher half of vector iteratively.
1829 // vtmp1 = vsrc[8:15]
1830 ins(vtmp1, D, vsrc, 0, 1);
1831 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1832 mulv(vtmp1, T8B, vtmp1, vsrc);
1833 // vtmp2 = vtmp1[4:7]
1834 ins(vtmp2, S, vtmp1, 0, 1);
1835 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1836 mulv(vtmp1, T8B, vtmp2, vtmp1);
1837 } else {
1838 ins(vtmp1, S, vsrc, 0, 1);
1839 mulv(vtmp1, T8B, vtmp1, vsrc);
1840 }
1841 // vtmp2 = vtmp1[2:3]
1842 ins(vtmp2, H, vtmp1, 0, 1);
1843 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1844 mulv(vtmp2, T8B, vtmp2, vtmp1);
1845 // dst = vtmp2[0] * isrc * vtmp2[1]
1846 umov(rscratch1, vtmp2, B, 0);
1847 mulw(dst, rscratch1, isrc);
1848 sxtb(dst, dst);
1849 umov(rscratch1, vtmp2, B, 1);
1850 mulw(dst, rscratch1, dst);
1851 sxtb(dst, dst);
1852 break;
1853 case T_SHORT:
1854 if (isQ) {
1855 ins(vtmp2, D, vsrc, 0, 1);
1856 mulv(vtmp2, T4H, vtmp2, vsrc);
1857 ins(vtmp1, S, vtmp2, 0, 1);
1858 mulv(vtmp1, T4H, vtmp1, vtmp2);
1859 } else {
1860 ins(vtmp1, S, vsrc, 0, 1);
1861 mulv(vtmp1, T4H, vtmp1, vsrc);
1862 }
1863 umov(rscratch1, vtmp1, H, 0);
1864 mulw(dst, rscratch1, isrc);
1865 sxth(dst, dst);
1866 umov(rscratch1, vtmp1, H, 1);
1867 mulw(dst, rscratch1, dst);
1868 sxth(dst, dst);
1869 break;
1870 case T_INT:
1871 if (isQ) {
1872 ins(vtmp1, D, vsrc, 0, 1);
1873 mulv(vtmp1, T2S, vtmp1, vsrc);
1874 } else {
1875 vtmp1 = vsrc;
1876 }
1877 umov(rscratch1, vtmp1, S, 0);
1878 mul(dst, rscratch1, isrc);
1879 umov(rscratch1, vtmp1, S, 1);
1880 mul(dst, rscratch1, dst);
1881 break;
1882 case T_LONG:
1883 umov(rscratch1, vsrc, D, 0);
1884 mul(dst, isrc, rscratch1);
1885 umov(rscratch1, vsrc, D, 1);
1886 mul(dst, dst, rscratch1);
1887 break;
1888 default:
1889 assert(false, "unsupported");
1890 ShouldNotReachHere();
1891 }
1892 BLOCK_COMMENT("} neon_reduce_mul_integral");
1893 }
1894
1895 // Vector reduction multiply for floating-point type with ASIMD instructions.
1896 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1897 FloatRegister fsrc, FloatRegister vsrc,
1898 unsigned vector_length_in_bytes,
1899 FloatRegister vtmp) {
1900 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1901 bool isQ = vector_length_in_bytes == 16;
1902
1903 BLOCK_COMMENT("neon_reduce_mul_fp {");
1904 switch(bt) {
1905 // The T_SHORT type below is for Float16 type which also uses floating-point
1906 // instructions.
1907 case T_SHORT:
1908 fmulh(dst, fsrc, vsrc);
1909 ext(vtmp, T8B, vsrc, vsrc, 2);
1910 fmulh(dst, dst, vtmp);
1911 ext(vtmp, T8B, vsrc, vsrc, 4);
1912 fmulh(dst, dst, vtmp);
1913 ext(vtmp, T8B, vsrc, vsrc, 6);
1914 fmulh(dst, dst, vtmp);
1915 if (isQ) {
1916 ext(vtmp, T16B, vsrc, vsrc, 8);
1917 fmulh(dst, dst, vtmp);
1918 ext(vtmp, T16B, vsrc, vsrc, 10);
1919 fmulh(dst, dst, vtmp);
1920 ext(vtmp, T16B, vsrc, vsrc, 12);
1921 fmulh(dst, dst, vtmp);
1922 ext(vtmp, T16B, vsrc, vsrc, 14);
1923 fmulh(dst, dst, vtmp);
1924 }
1925 break;
1926 case T_FLOAT:
1927 fmuls(dst, fsrc, vsrc);
1928 ins(vtmp, S, vsrc, 0, 1);
1929 fmuls(dst, dst, vtmp);
1930 if (isQ) {
1931 ins(vtmp, S, vsrc, 0, 2);
1932 fmuls(dst, dst, vtmp);
1933 ins(vtmp, S, vsrc, 0, 3);
1934 fmuls(dst, dst, vtmp);
1935 }
1936 break;
1937 case T_DOUBLE:
1938 assert(isQ, "unsupported");
1939 fmuld(dst, fsrc, vsrc);
1940 ins(vtmp, D, vsrc, 0, 1);
1941 fmuld(dst, dst, vtmp);
1942 break;
1943 default:
1944 assert(false, "unsupported");
1945 ShouldNotReachHere();
1946 }
1947 BLOCK_COMMENT("} neon_reduce_mul_fp");
1948 }
1949
1950 // Vector reduction add for half float type with ASIMD instructions.
1951 void C2_MacroAssembler::neon_reduce_add_fp16(FloatRegister dst, FloatRegister fsrc, FloatRegister vsrc,
1952 unsigned vector_length_in_bytes, FloatRegister vtmp) {
1953 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1954 bool isQ = vector_length_in_bytes == 16;
1955
1956 BLOCK_COMMENT("neon_reduce_add_fp16 {");
1957 faddh(dst, fsrc, vsrc);
1958 ext(vtmp, T8B, vsrc, vsrc, 2);
1959 faddh(dst, dst, vtmp);
1960 ext(vtmp, T8B, vsrc, vsrc, 4);
1961 faddh(dst, dst, vtmp);
1962 ext(vtmp, T8B, vsrc, vsrc, 6);
1963 faddh(dst, dst, vtmp);
1964 if (isQ) {
1965 ext(vtmp, T16B, vsrc, vsrc, 8);
1966 faddh(dst, dst, vtmp);
1967 ext(vtmp, T16B, vsrc, vsrc, 10);
1968 faddh(dst, dst, vtmp);
1969 ext(vtmp, T16B, vsrc, vsrc, 12);
1970 faddh(dst, dst, vtmp);
1971 ext(vtmp, T16B, vsrc, vsrc, 14);
1972 faddh(dst, dst, vtmp);
1973 }
1974 BLOCK_COMMENT("} neon_reduce_add_fp16");
1975 }
1976
1977 // Helper to select logical instruction
1978 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1979 Register Rn, Register Rm,
1980 enum shift_kind kind, unsigned shift) {
1981 switch(opc) {
1982 case Op_AndReductionV:
1983 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1984 break;
1985 case Op_OrReductionV:
1986 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1987 break;
1988 case Op_XorReductionV:
1989 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1990 break;
1991 default:
1992 assert(false, "unsupported");
1993 ShouldNotReachHere();
1994 }
1995 }
1996
1997 // Vector reduction logical operations And, Or, Xor
1998 // Clobbers: rscratch1
1999 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2000 Register isrc, FloatRegister vsrc,
2001 unsigned vector_length_in_bytes) {
2002 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2003 "unsupported");
2004 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2005 assert_different_registers(dst, isrc);
2006 bool isQ = vector_length_in_bytes == 16;
2007
2008 BLOCK_COMMENT("neon_reduce_logical {");
2009 umov(rscratch1, vsrc, isQ ? D : S, 0);
2010 umov(dst, vsrc, isQ ? D : S, 1);
2011 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2012 switch(bt) {
2013 case T_BYTE:
2014 if (isQ) {
2015 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2016 }
2017 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2018 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2019 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2020 sxtb(dst, dst);
2021 break;
2022 case T_SHORT:
2023 if (isQ) {
2024 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2025 }
2026 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2027 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2028 sxth(dst, dst);
2029 break;
2030 case T_INT:
2031 if (isQ) {
2032 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2033 }
2034 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2035 break;
2036 case T_LONG:
2037 assert(isQ, "unsupported");
2038 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2039 break;
2040 default:
2041 assert(false, "unsupported");
2042 ShouldNotReachHere();
2043 }
2044 BLOCK_COMMENT("} neon_reduce_logical");
2045 }
2046
2047 // Helper function to decode min/max reduction operation properties
2048 void C2_MacroAssembler::decode_minmax_reduction_opc(int opc, bool* is_min,
2049 bool* is_unsigned,
2050 Condition* cond) {
2051 switch(opc) {
2052 case Op_MinReductionV:
2053 *is_min = true; *is_unsigned = false; *cond = LT; break;
2054 case Op_MaxReductionV:
2055 *is_min = false; *is_unsigned = false; *cond = GT; break;
2056 case Op_UMinReductionV:
2057 *is_min = true; *is_unsigned = true; *cond = LO; break;
2058 case Op_UMaxReductionV:
2059 *is_min = false; *is_unsigned = true; *cond = HI; break;
2060 default:
2061 ShouldNotReachHere();
2062 }
2063 }
2064
2065 // Vector reduction min/max/umin/umax for integral type with ASIMD instructions.
2066 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2067 // Clobbers: rscratch1, rflags
2068 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2069 Register isrc, FloatRegister vsrc,
2070 unsigned vector_length_in_bytes,
2071 FloatRegister vtmp) {
2072 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV ||
2073 opc == Op_UMinReductionV || opc == Op_UMaxReductionV, "unsupported");
2074 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2075 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2076 assert_different_registers(dst, isrc);
2077 bool isQ = vector_length_in_bytes == 16;
2078 bool is_min;
2079 bool is_unsigned;
2080 Condition cond;
2081 decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2082 BLOCK_COMMENT("neon_reduce_minmax_integral {");
2083 if (bt == T_LONG) {
2084 assert(vtmp == fnoreg, "should be");
2085 assert(isQ, "should be");
2086 umov(rscratch1, vsrc, D, 0);
2087 cmp(isrc, rscratch1);
2088 csel(dst, isrc, rscratch1, cond);
2089 umov(rscratch1, vsrc, D, 1);
2090 cmp(dst, rscratch1);
2091 csel(dst, dst, rscratch1, cond);
2092 } else {
2093 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2094 if (size == T2S) {
2095 // For T2S (2x32-bit elements), use pairwise instructions because
2096 // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
2097 neon_minmaxp(is_unsigned, is_min, vtmp, size, vsrc, vsrc);
2098 } else {
2099 // For other sizes, use reduction to scalar instructions.
2100 neon_minmaxv(is_unsigned, is_min, vtmp, size, vsrc);
2101 }
2102 if (bt == T_INT) {
2103 umov(dst, vtmp, S, 0);
2104 } else if (is_unsigned) {
2105 umov(dst, vtmp, elemType_to_regVariant(bt), 0);
2106 } else {
2107 smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2108 }
2109 cmpw(dst, isrc);
2110 cselw(dst, dst, isrc, cond);
2111 }
2112 BLOCK_COMMENT("} neon_reduce_minmax_integral");
2113 }
2114
2115 // Vector reduction for integral type with SVE instruction.
2116 // Supported operations are Add, And, Or, Xor, Max, Min, UMax, UMin.
2117 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2118 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2119 FloatRegister src2, PRegister pg, FloatRegister tmp) {
2120 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2121 assert(pg->is_governing(), "This register has to be a governing predicate register");
2122 assert_different_registers(src1, dst);
2123 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2124 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2125 switch (opc) {
2126 case Op_AddReductionVI: {
2127 sve_uaddv(tmp, size, pg, src2);
2128 if (bt == T_BYTE) {
2129 smov(dst, tmp, size, 0);
2130 addw(dst, src1, dst, ext::sxtb);
2131 } else if (bt == T_SHORT) {
2132 smov(dst, tmp, size, 0);
2133 addw(dst, src1, dst, ext::sxth);
2134 } else {
2135 umov(dst, tmp, size, 0);
2136 addw(dst, dst, src1);
2137 }
2138 break;
2139 }
2140 case Op_AddReductionVL: {
2141 sve_uaddv(tmp, size, pg, src2);
2142 umov(dst, tmp, size, 0);
2143 add(dst, dst, src1);
2144 break;
2145 }
2146 case Op_AndReductionV: {
2147 sve_andv(tmp, size, pg, src2);
2148 if (bt == T_INT || bt == T_LONG) {
2149 umov(dst, tmp, size, 0);
2150 } else {
2151 smov(dst, tmp, size, 0);
2152 }
2153 if (bt == T_LONG) {
2154 andr(dst, dst, src1);
2155 } else {
2156 andw(dst, dst, src1);
2157 }
2158 break;
2159 }
2160 case Op_OrReductionV: {
2161 sve_orv(tmp, size, pg, src2);
2162 if (bt == T_INT || bt == T_LONG) {
2163 umov(dst, tmp, size, 0);
2164 } else {
2165 smov(dst, tmp, size, 0);
2166 }
2167 if (bt == T_LONG) {
2168 orr(dst, dst, src1);
2169 } else {
2170 orrw(dst, dst, src1);
2171 }
2172 break;
2173 }
2174 case Op_XorReductionV: {
2175 sve_eorv(tmp, size, pg, src2);
2176 if (bt == T_INT || bt == T_LONG) {
2177 umov(dst, tmp, size, 0);
2178 } else {
2179 smov(dst, tmp, size, 0);
2180 }
2181 if (bt == T_LONG) {
2182 eor(dst, dst, src1);
2183 } else {
2184 eorw(dst, dst, src1);
2185 }
2186 break;
2187 }
2188 case Op_MaxReductionV:
2189 case Op_MinReductionV:
2190 case Op_UMaxReductionV:
2191 case Op_UMinReductionV: {
2192 bool is_min;
2193 bool is_unsigned;
2194 Condition cond;
2195 decode_minmax_reduction_opc(opc, &is_min, &is_unsigned, &cond);
2196 sve_minmaxv(is_unsigned, is_min, tmp, size, pg, src2);
2197 // Move result from vector to general register
2198 if (is_unsigned || bt == T_INT || bt == T_LONG) {
2199 umov(dst, tmp, size, 0);
2200 } else {
2201 smov(dst, tmp, size, 0);
2202 }
2203 if (bt == T_LONG) {
2204 cmp(dst, src1);
2205 csel(dst, dst, src1, cond);
2206 } else {
2207 cmpw(dst, src1);
2208 cselw(dst, dst, src1, cond);
2209 }
2210 break;
2211 }
2212 default:
2213 assert(false, "unsupported");
2214 ShouldNotReachHere();
2215 }
2216
2217 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2218 if (bt == T_BYTE) {
2219 sxtb(dst, dst);
2220 } else if (bt == T_SHORT) {
2221 sxth(dst, dst);
2222 }
2223 }
2224 }
2225
2226 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2227 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2228 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2229 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2230 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2231 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2232
2233 // Set all elements to false if the input "lane_cnt" is zero.
2234 if (lane_cnt == 0) {
2235 sve_pfalse(dst);
2236 return;
2237 }
2238
2239 SIMD_RegVariant size = elemType_to_regVariant(bt);
2240 assert(size != Q, "invalid size");
2241
2242 // Set all true if "lane_cnt" equals to the max lane count.
2243 if (lane_cnt == max_vector_length) {
2244 sve_ptrue(dst, size, /* ALL */ 0b11111);
2245 return;
2246 }
2247
2248 // Fixed numbers for "ptrue".
2249 switch(lane_cnt) {
2250 case 1: /* VL1 */
2251 case 2: /* VL2 */
2252 case 3: /* VL3 */
2253 case 4: /* VL4 */
2254 case 5: /* VL5 */
2255 case 6: /* VL6 */
2256 case 7: /* VL7 */
2257 case 8: /* VL8 */
2258 sve_ptrue(dst, size, lane_cnt);
2259 return;
2260 case 16:
2261 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2262 return;
2263 case 32:
2264 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2265 return;
2266 case 64:
2267 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2268 return;
2269 case 128:
2270 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2271 return;
2272 case 256:
2273 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2274 return;
2275 default:
2276 break;
2277 }
2278
2279 // Special patterns for "ptrue".
2280 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2281 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2282 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2283 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2284 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2285 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2286 } else {
2287 // Encode to "whileltw" for the remaining cases.
2288 mov(rscratch1, lane_cnt);
2289 sve_whileltw(dst, size, zr, rscratch1);
2290 }
2291 }
2292
2293 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2294 // Any remaining elements of dst will be filled with zero.
2295 // Clobbers: rscratch1
2296 // Preserves: mask, vzr
2297 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2298 FloatRegister vzr, FloatRegister vtmp,
2299 PRegister pgtmp, unsigned vector_length_in_bytes) {
2300 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2301 // When called by sve_compress_byte, src and vtmp may be the same register.
2302 assert_different_registers(dst, src, vzr);
2303 assert_different_registers(dst, vtmp, vzr);
2304 assert_different_registers(mask, pgtmp);
2305 // high <-- low
2306 // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
2307 // mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
2308 // Expected result: dst = 00 00 00 hh ee dd bb aa
2309
2310 // Extend lowest half to type INT.
2311 // dst = 00dd 00cc 00bb 00aa
2312 sve_uunpklo(dst, S, src);
2313 // pgtmp = 0001 0000 0001 0001
2314 sve_punpklo(pgtmp, mask);
2315 // Pack the active elements in size of type INT to the right,
2316 // and fill the remainings with zero.
2317 // dst = 0000 00dd 00bb 00aa
2318 sve_compact(dst, S, dst, pgtmp);
2319 // Narrow the result back to type SHORT.
2320 // dst = 00 00 00 00 00 dd bb aa
2321 sve_uzp1(dst, H, dst, vzr);
2322
2323 // Return if the vector length is no more than MaxVectorSize/2, since the
2324 // highest half is invalid.
2325 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2326 return;
2327 }
2328
2329 // Count the active elements of lowest half.
2330 // rscratch1 = 3
2331 sve_cntp(rscratch1, S, ptrue, pgtmp);
2332
2333 // Repeat to the highest half.
2334 // pgtmp = 0001 0000 0000 0001
2335 sve_punpkhi(pgtmp, mask);
2336 // vtmp = 00hh 00gg 00ff 00ee
2337 sve_uunpkhi(vtmp, S, src);
2338 // vtmp = 0000 0000 00hh 00ee
2339 sve_compact(vtmp, S, vtmp, pgtmp);
2340 // vtmp = 00 00 00 00 00 00 hh ee
2341 sve_uzp1(vtmp, H, vtmp, vzr);
2342
2343 // pgtmp = 00 00 00 00 00 01 01 01
2344 sve_whilelt(pgtmp, H, zr, rscratch1);
2345 // Compressed low: dst = 00 00 00 00 00 dd bb aa
2346 // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2347 // Combine the compressed low with the compressed high:
2348 // dst = 00 00 00 hh ee dd bb aa
2349 sve_splice(dst, H, pgtmp, vtmp);
2350 }
2351
2352 // Clobbers: rscratch1, rscratch2
2353 // Preserves: src, mask
2354 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2355 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2356 PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
2357 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2358 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
2359 assert_different_registers(mask, ptmp, pgtmp);
2360 // high <-- low
2361 // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
2362 // mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2363 // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2364 FloatRegister vzr = vtmp3;
2365 sve_dup(vzr, B, 0);
2366
2367 // Extend lowest half to type SHORT.
2368 // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
2369 sve_uunpklo(vtmp1, H, src);
2370 // ptmp = 00 01 00 00 00 01 00 01
2371 sve_punpklo(ptmp, mask);
2372 // Pack the active elements in size of type SHORT to the right,
2373 // and fill the remainings with zero.
2374 // dst = 00 00 00 00 00 0g 0c 0a
2375 unsigned extended_size = vector_length_in_bytes << 1;
2376 sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
2377 // Narrow the result back to type BYTE.
2378 // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2379 sve_uzp1(dst, B, dst, vzr);
2380
2381 // Return if the vector length is no more than MaxVectorSize/2, since the
2382 // highest half is invalid.
2383 if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
2384 return;
2385 }
2386 // Count the active elements of lowest half.
2387 // rscratch2 = 3
2388 sve_cntp(rscratch2, H, ptrue, ptmp);
2389
2390 // Repeat to the highest half.
2391 // ptmp = 00 01 00 00 00 00 00 01
2392 sve_punpkhi(ptmp, mask);
2393 // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
2394 sve_uunpkhi(vtmp2, H, src);
2395 // vtmp1 = 00 00 00 00 00 00 0p 0i
2396 sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2397 // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2398 sve_uzp1(vtmp1, B, vtmp1, vzr);
2399
2400 // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2401 sve_whilelt(ptmp, B, zr, rscratch2);
2402 // Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2403 // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2404 // Combine the compressed low with the compressed high:
2405 // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2406 sve_splice(dst, B, ptmp, vtmp1);
2407 }
2408
2409 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2410 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2411 SIMD_Arrangement size = isQ ? T16B : T8B;
2412 if (bt == T_BYTE) {
2413 rbit(dst, size, src);
2414 } else {
2415 neon_reverse_bytes(dst, src, bt, isQ);
2416 rbit(dst, size, dst);
2417 }
2418 }
2419
2420 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2421 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2422 SIMD_Arrangement size = isQ ? T16B : T8B;
2423 switch (bt) {
2424 case T_BYTE:
2425 if (dst != src) {
2426 orr(dst, size, src, src);
2427 }
2428 break;
2429 case T_SHORT:
2430 rev16(dst, size, src);
2431 break;
2432 case T_INT:
2433 rev32(dst, size, src);
2434 break;
2435 case T_LONG:
2436 rev64(dst, size, src);
2437 break;
2438 default:
2439 assert(false, "unsupported");
2440 ShouldNotReachHere();
2441 }
2442 }
2443
2444 // VectorRearrange implementation for short/int/float/long/double types with NEON
2445 // instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
2446 // But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
2447 // For VectorRearrange long/double, we compare the shuffle input with iota indices,
2448 // and use bsl to implement the operation.
2449 void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
2450 FloatRegister shuffle, FloatRegister tmp,
2451 BasicType bt, bool isQ) {
2452 assert_different_registers(dst, src, shuffle, tmp);
2453 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2454 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2455
2456 // Here is an example that rearranges a NEON vector with 4 ints:
2457 // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
2458 // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
2459 // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
2460 // [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
2461 // tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
2462 // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
2463 // and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
2464 // 4. Use Vm as index register, and use V1 as table register.
2465 // Then get V2 as the result by tbl NEON instructions.
2466 switch (bt) {
2467 case T_SHORT:
2468 mov(tmp, size1, 0x02);
2469 mulv(dst, size2, shuffle, tmp);
2470 mov(tmp, size2, 0x0100);
2471 addv(dst, size1, dst, tmp);
2472 tbl(dst, size1, src, 1, dst);
2473 break;
2474 case T_INT:
2475 case T_FLOAT:
2476 mov(tmp, size1, 0x04);
2477 mulv(dst, size2, shuffle, tmp);
2478 mov(tmp, size2, 0x03020100);
2479 addv(dst, size1, dst, tmp);
2480 tbl(dst, size1, src, 1, dst);
2481 break;
2482 case T_LONG:
2483 case T_DOUBLE:
2484 {
2485 int idx = vector_iota_entry_index(T_LONG);
2486 lea(rscratch1,
2487 ExternalAddress(StubRoutines::aarch64::vector_iota_indices(idx)));
2488 ldrq(tmp, rscratch1);
2489 // Check whether the input "shuffle" is the same with iota indices.
2490 // Return "src" if true, otherwise swap the two elements of "src".
2491 cm(EQ, dst, size2, shuffle, tmp);
2492 ext(tmp, size1, src, src, 8);
2493 bsl(dst, size1, src, tmp);
2494 }
2495 break;
2496 default:
2497 assert(false, "unsupported element type");
2498 ShouldNotReachHere();
2499 }
2500 }
2501
2502 // Extract a scalar element from an sve vector at position 'idx'.
2503 // The input elements in src are expected to be of integral type.
2504 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2505 int idx, FloatRegister vtmp) {
2506 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2507 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2508 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2509 if (bt == T_INT || bt == T_LONG) {
2510 umov(dst, src, size, idx);
2511 } else {
2512 smov(dst, src, size, idx);
2513 }
2514 } else {
2515 sve_movprfx(vtmp, src);
2516 // Although vtmp and src hold the same value after movprfx, we must use src
2517 // (not vtmp) as the second source of ext. The movprfx destination register
2518 // must not appear in any source operand of the following instruction except
2519 // as the destructive operand.
2520 sve_ext(vtmp, src, idx << size);
2521 if (bt == T_INT || bt == T_LONG) {
2522 umov(dst, vtmp, size, 0);
2523 } else {
2524 smov(dst, vtmp, size, 0);
2525 }
2526 }
2527 }
2528
2529 // java.lang.Math::round intrinsics
2530
2531 // Clobbers: rscratch1, rflags
2532 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2533 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2534 assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2535 switch (T) {
2536 case T2S:
2537 case T4S:
2538 fmovs(tmp1, T, 0.5f);
2539 mov(rscratch1, jint_cast(0x1.0p23f));
2540 break;
2541 case T2D:
2542 fmovd(tmp1, T, 0.5);
2543 mov(rscratch1, julong_cast(0x1.0p52));
2544 break;
2545 default:
2546 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2547 }
2548 fadd(tmp1, T, tmp1, src);
2549 fcvtms(tmp1, T, tmp1);
2550 // tmp1 = floor(src + 0.5, ties to even)
2551
2552 fcvtas(dst, T, src);
2553 // dst = round(src), ties to away
2554
2555 fneg(tmp3, T, src);
2556 dup(tmp2, T, rscratch1);
2557 cm(HS, tmp3, T, tmp3, tmp2);
2558 // tmp3 is now a set of flags
2559
2560 bif(dst, T16B, tmp1, tmp3);
2561 // result in dst
2562 }
2563
2564 // Clobbers: rscratch1, rflags
2565 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2566 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2567 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2568 assert_different_registers(tmp1, tmp2, src, dst);
2569
2570 switch (T) {
2571 case S:
2572 mov(rscratch1, jint_cast(0x1.0p23f));
2573 break;
2574 case D:
2575 mov(rscratch1, julong_cast(0x1.0p52));
2576 break;
2577 default:
2578 assert(T == S || T == D, "invalid register variant");
2579 }
2580
2581 sve_frinta(dst, T, ptrue, src);
2582 // dst = round(src), ties to away
2583
2584 Label none;
2585
2586 sve_fneg(tmp1, T, ptrue, src);
2587 sve_dup(tmp2, T, rscratch1);
2588 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2589 br(EQ, none);
2590 {
2591 sve_cpy(tmp1, T, pgtmp, 0.5);
2592 sve_fadd(tmp1, T, pgtmp, src);
2593 sve_frintm(dst, T, pgtmp, tmp1);
2594 // dst = floor(src + 0.5, ties to even)
2595 }
2596 bind(none);
2597
2598 sve_fcvtzs(dst, T, ptrue, dst, T);
2599 // result in dst
2600 }
2601
2602 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2603 FloatRegister one, SIMD_Arrangement T) {
2604 assert_different_registers(dst, src, zero, one);
2605 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2606
2607 facgt(dst, T, src, zero);
2608 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2609 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2610 }
2611
2612 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2613 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2614 assert_different_registers(dst, src, zero, one, vtmp);
2615 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2616
2617 sve_orr(vtmp, src, src);
2618 sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2619 switch (T) {
2620 case S:
2621 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2622 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2623 // on the sign of the float value
2624 break;
2625 case D:
2626 sve_and(vtmp, T, min_jlong);
2627 sve_orr(vtmp, T, jlong_cast(1.0));
2628 break;
2629 default:
2630 assert(false, "unsupported");
2631 ShouldNotReachHere();
2632 }
2633 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2634 // Result in dst
2635 }
2636
2637 bool C2_MacroAssembler::in_scratch_emit_size() {
2638 if (ciEnv::current()->task() != nullptr) {
2639 PhaseOutput* phase_output = Compile::current()->output();
2640 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2641 return true;
2642 }
2643 }
2644 return MacroAssembler::in_scratch_emit_size();
2645 }
2646
2647 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
2648 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
2649 }
2650
2651 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register rval, Register rtmp) {
2652 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2653 if (t == TypeInt::INT) {
2654 return;
2655 }
2656
2657 BLOCK_COMMENT("verify_int_in_range {");
2658 Label L_success, L_failure;
2659
2660 jint lo = t->_lo;
2661 jint hi = t->_hi;
2662
2663 if (lo != min_jint) {
2664 subsw(rtmp, rval, lo);
2665 br(Assembler::LT, L_failure);
2666 }
2667 if (hi != max_jint) {
2668 subsw(rtmp, rval, hi);
2669 br(Assembler::GT, L_failure);
2670 }
2671 b(L_success);
2672
2673 bind(L_failure);
2674 movw(c_rarg0, idx);
2675 mov(c_rarg1, rval);
2676 movw(c_rarg2, lo);
2677 movw(c_rarg3, hi);
2678 reconstruct_frame_pointer(rtmp);
2679 rt_call(CAST_FROM_FN_PTR(address, abort_verify_int_in_range), rtmp);
2680 hlt(0);
2681
2682 bind(L_success);
2683 BLOCK_COMMENT("} verify_int_in_range");
2684 }
2685
2686 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
2687 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
2688 }
2689
2690 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register rval, Register rtmp) {
2691 assert(!t->empty() && !t->singleton(), "%s", Type::str(t));
2692 if (t == TypeLong::LONG) {
2693 return;
2694 }
2695
2696 BLOCK_COMMENT("verify_long_in_range {");
2697 Label L_success, L_failure;
2698
2699 jlong lo = t->_lo;
2700 jlong hi = t->_hi;
2701
2702 if (lo != min_jlong) {
2703 subs(rtmp, rval, lo);
2704 br(Assembler::LT, L_failure);
2705 }
2706 if (hi != max_jlong) {
2707 subs(rtmp, rval, hi);
2708 br(Assembler::GT, L_failure);
2709 }
2710 b(L_success);
2711
2712 bind(L_failure);
2713 movw(c_rarg0, idx);
2714 mov(c_rarg1, rval);
2715 mov(c_rarg2, lo);
2716 mov(c_rarg3, hi);
2717 reconstruct_frame_pointer(rtmp);
2718 rt_call(CAST_FROM_FN_PTR(address, abort_verify_long_in_range), rtmp);
2719 hlt(0);
2720
2721 bind(L_success);
2722 BLOCK_COMMENT("} verify_long_in_range");
2723 }
2724
2725 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
2726 const int framesize = Compile::current()->output()->frame_size_in_bytes();
2727 if (PreserveFramePointer) {
2728 // frame pointer is valid
2729 #ifdef ASSERT
2730 // Verify frame pointer value in rfp.
2731 add(rtmp, sp, framesize - 2 * wordSize);
2732 Label L_success;
2733 cmp(rfp, rtmp);
2734 br(Assembler::EQ, L_success);
2735 stop("frame pointer mismatch");
2736 bind(L_success);
2737 #endif // ASSERT
2738 } else {
2739 add(rfp, sp, framesize - 2 * wordSize);
2740 }
2741 }
2742
2743 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2744 // using Neon instructions and places it in the destination vector element corresponding to the
2745 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2746 // where NUM_ELEM is the number of BasicType elements per vector.
2747 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2748 // Otherwise, selects src2[idx – NUM_ELEM]
2749 void C2_MacroAssembler::select_from_two_vectors_neon(FloatRegister dst, FloatRegister src1,
2750 FloatRegister src2, FloatRegister index,
2751 FloatRegister tmp, unsigned vector_length_in_bytes) {
2752 assert_different_registers(dst, src1, src2, tmp);
2753 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2754
2755 if (vector_length_in_bytes == 16) {
2756 assert(UseSVE <= 1, "sve must be <= 1");
2757 assert(src1->successor() == src2, "Source registers must be ordered");
2758 // If the vector length is 16B, then use the Neon "tbl" instruction with two vector table
2759 tbl(dst, size, src1, 2, index);
2760 } else { // vector length == 8
2761 assert(UseSVE == 0, "must be Neon only");
2762 // We need to fit both the source vectors (src1, src2) in a 128-bit register because the
2763 // Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl"
2764 // instruction with one vector lookup
2765 ins(tmp, D, src1, 0, 0);
2766 ins(tmp, D, src2, 1, 0);
2767 tbl(dst, size, tmp, 1, index);
2768 }
2769 }
2770
2771 // Selects elements from two source vectors (src1, src2) based on index values in the index register
2772 // using SVE/SVE2 instructions and places it in the destination vector element corresponding to the
2773 // index vector element. Each index in the index register must be in the range - [0, 2 * NUM_ELEM),
2774 // where NUM_ELEM is the number of BasicType elements per vector.
2775 // If idx < NUM_ELEM --> selects src1[idx] (idx is an element of the index register)
2776 // Otherwise, selects src2[idx – NUM_ELEM]
2777 void C2_MacroAssembler::select_from_two_vectors_sve(FloatRegister dst, FloatRegister src1,
2778 FloatRegister src2, FloatRegister index,
2779 FloatRegister tmp, SIMD_RegVariant T,
2780 unsigned vector_length_in_bytes) {
2781 assert_different_registers(dst, src1, src2, index, tmp);
2782
2783 if (vector_length_in_bytes == 8) {
2784 // We need to fit both the source vectors (src1, src2) in a single vector register because the
2785 // SVE "tbl" instruction is unpredicated and works on the entire vector which can lead to
2786 // incorrect results if each source vector is only partially filled. We then use the SVE "tbl"
2787 // instruction with one vector lookup
2788 assert(UseSVE >= 1, "sve must be >= 1");
2789 ins(tmp, D, src1, 0, 0);
2790 ins(tmp, D, src2, 1, 0);
2791 sve_tbl(dst, T, tmp, index);
2792 } else { // UseSVE == 2 and vector_length_in_bytes > 8
2793 // If the vector length is > 8, then use the SVE2 "tbl" instruction with the two vector table.
2794 // The assertion - vector_length_in_bytes == MaxVectorSize ensures that this operation
2795 // is not executed on machines where vector_length_in_bytes < MaxVectorSize
2796 // with the only exception of 8B vector length.
2797 assert(UseSVE == 2 && vector_length_in_bytes == MaxVectorSize, "must be");
2798 assert(src1->successor() == src2, "Source registers must be ordered");
2799 sve_tbl(dst, T, src1, src2, index);
2800 }
2801 }
2802
2803 void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1,
2804 FloatRegister src2, FloatRegister index,
2805 FloatRegister tmp, BasicType bt,
2806 unsigned vector_length_in_bytes) {
2807
2808 assert_different_registers(dst, src1, src2, index, tmp);
2809
2810 // The cases that can reach this method are -
2811 // - UseSVE = 0/1, vector_length_in_bytes = 8 or 16, excluding double and long types
2812 // - UseSVE = 2, vector_length_in_bytes >= 8, for all types
2813 //
2814 // SVE/SVE2 tbl instructions are generated when UseSVE = 1 with vector_length_in_bytes = 8
2815 // and UseSVE = 2 with vector_length_in_bytes >= 8
2816 //
2817 // Neon instructions are generated when UseSVE = 0 with vector_length_in_bytes = 8 or 16 and
2818 // UseSVE = 1 with vector_length_in_bytes = 16
2819
2820 if ((UseSVE == 1 && vector_length_in_bytes == 8) || UseSVE == 2) {
2821 SIMD_RegVariant T = elemType_to_regVariant(bt);
2822 select_from_two_vectors_sve(dst, src1, src2, index, tmp, T, vector_length_in_bytes);
2823 return;
2824 }
2825
2826 // The only BasicTypes that can reach here are T_SHORT, T_BYTE, T_INT and T_FLOAT
2827 assert(bt != T_DOUBLE && bt != T_LONG, "unsupported basic type");
2828 assert(vector_length_in_bytes <= 16, "length_in_bytes must be <= 16");
2829
2830 bool isQ = vector_length_in_bytes == 16;
2831
2832 SIMD_Arrangement size1 = isQ ? T16B : T8B;
2833 SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
2834
2835 // Neon "tbl" instruction only supports byte tables, so we need to look at chunks of
2836 // 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table.
2837 // The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM
2838 // is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length,
2839 // the indices can range from [0, 8).
2840 // As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0]
2841 // Move a constant 0x02 in every byte of tmp - tmp = [0x0202, 0x0202, 0x0202, 0x0202]
2842 // Multiply index vector with tmp to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000]
2843 // Move a constant 0x0100 in every 2B of tmp - tmp = [0x0100, 0x0100, 0x0100, 0x0100]
2844 // Add the multiplied result to the vector in tmp to obtain the byte level
2845 // offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100]
2846 // Use these offsets in the "tbl" instruction to select chunks of 2B.
2847
2848 if (bt == T_BYTE) {
2849 select_from_two_vectors_neon(dst, src1, src2, index, tmp, vector_length_in_bytes);
2850 } else {
2851 int elem_size = (bt == T_SHORT) ? 2 : 4;
2852 uint64_t tbl_offset = (bt == T_SHORT) ? 0x0100u : 0x03020100u;
2853
2854 mov(tmp, size1, elem_size);
2855 mulv(dst, size2, index, tmp);
2856 mov(tmp, size2, tbl_offset);
2857 addv(dst, size1, dst, tmp); // "dst" now contains the processed index elements
2858 // to select a set of 2B/4B
2859 select_from_two_vectors_neon(dst, src1, src2, dst, tmp, vector_length_in_bytes);
2860 }
2861 }
2862
// Vector expand implementation. Elements from the src vector are expanded into
// the dst vector under the control of the vector mask.
// Since there are no native instructions directly corresponding to expand before
// SVE2p2, the following implementations mainly leverage the TBL instruction to
// implement expand. To compute the index input for TBL, the prefix sum algorithm
// (https://en.wikipedia.org/wiki/Prefix_sum) is used. The same algorithm is used
// for NEON and SVE, but with different instructions where appropriate.
2870
2871 // Vector expand implementation for NEON.
2872 //
2873 // An example of 128-bit Byte vector:
2874 // Data direction: high <== low
2875 // Input:
2876 // src = g f e d c b a 9 8 7 6 5 4 3 2 1
2877 // mask = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2878 // Expected result:
2879 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2880 void C2_MacroAssembler::vector_expand_neon(FloatRegister dst, FloatRegister src, FloatRegister mask,
2881 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2882 int vector_length_in_bytes) {
2883 assert(vector_length_in_bytes <= 16, "the vector length in bytes for NEON must be <= 16");
2884 assert_different_registers(dst, src, mask, tmp1, tmp2);
2885 // Since the TBL instruction only supports byte table, we need to
2886 // compute indices in byte type for all types.
2887 SIMD_Arrangement size = vector_length_in_bytes == 16 ? T16B : T8B;
2888 // tmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2889 dup(tmp1, size, zr);
2890 // dst = 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1
2891 negr(dst, size, mask);
2892 // Calculate vector index for TBL with prefix sum algorithm.
2893 // dst = 8 8 8 7 6 6 6 5 4 4 4 3 2 2 2 1
2894 for (int i = 1; i < vector_length_in_bytes; i <<= 1) {
2895 ext(tmp2, size, tmp1, dst, vector_length_in_bytes - i);
2896 addv(dst, size, tmp2, dst);
2897 }
2898 // tmp2 = 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1 0 0 -1 -1
2899 orr(tmp2, size, mask, mask);
2900 // tmp2 = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2901 bsl(tmp2, size, dst, tmp1);
2902 // tmp1 = 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2903 movi(tmp1, size, 1);
2904 // dst = -1 -1 7 6 -1 -1 5 4 -1 -1 3 2 -1 -1 1 0
2905 subv(dst, size, tmp2, tmp1);
2906 // dst = 0 0 8 7 0 0 6 5 0 0 4 3 0 0 2 1
2907 tbl(dst, size, src, 1, dst);
2908 }
2909
2910 // Vector expand implementation for SVE.
2911 //
2912 // An example of 128-bit Short vector:
2913 // Data direction: high <== low
2914 // Input:
2915 // src = gf ed cb a9 87 65 43 21
2916 // pg = 00 01 00 01 00 01 00 01
2917 // Expected result:
2918 // dst = 00 87 00 65 00 43 00 21
2919 void C2_MacroAssembler::vector_expand_sve(FloatRegister dst, FloatRegister src, PRegister pg,
2920 FloatRegister tmp1, FloatRegister tmp2, BasicType bt,
2921 int vector_length_in_bytes) {
2922 assert(UseSVE > 0, "expand implementation only for SVE");
2923 assert_different_registers(dst, src, tmp1, tmp2);
2924 SIMD_RegVariant size = elemType_to_regVariant(bt);
2925
2926 // tmp1 = 00 00 00 00 00 00 00 00
2927 sve_dup(tmp1, size, 0);
2928 sve_movprfx(tmp2, tmp1);
2929 // tmp2 = 00 01 00 01 00 01 00 01
2930 sve_cpy(tmp2, size, pg, 1, true);
2931 // Calculate vector index for TBL with prefix sum algorithm.
2932 // tmp2 = 04 04 03 03 02 02 01 01
2933 for (int i = type2aelembytes(bt); i < vector_length_in_bytes; i <<= 1) {
2934 sve_movprfx(dst, tmp1);
2935 // The EXT instruction operates on the full-width sve register. The correct
2936 // index calculation method is:
2937 // vector_length_in_bytes - i + MaxVectorSize - vector_length_in_bytes =>
2938 // MaxVectorSize - i.
2939 sve_ext(dst, tmp2, MaxVectorSize - i);
2940 sve_add(tmp2, size, dst, tmp2);
2941 }
2942 // dst = 00 04 00 03 00 02 00 01
2943 sve_sel(dst, size, pg, tmp2, tmp1);
2944 // dst = -1 03 -1 02 -1 01 -1 00
2945 sve_sub(dst, size, 1);
2946 // dst = 00 87 00 65 00 43 00 21
2947 sve_tbl(dst, size, src, dst);
2948 }
2949
2950 // Optimized SVE cpy (imm, zeroing) instruction.
2951 //
2952 // `movi; cpy(imm, merging)` and `cpy(imm, zeroing)` have the same
2953 // functionality, but test results show that `movi; cpy(imm, merging)` has
2954 // higher throughput on some microarchitectures. This would depend on
2955 // microarchitecture and so may vary between implementations.
2956 void C2_MacroAssembler::sve_cpy(FloatRegister dst, SIMD_RegVariant T,
2957 PRegister pg, int imm8, bool isMerge) {
2958 if (VM_Version::prefer_sve_merging_mode_cpy() && !isMerge) {
2959 // Generates a NEON instruction `movi V<dst>.2d, #0`.
2960 // On AArch64, Z and V registers alias in the low 128 bits, so V<dst> is
2961 // the low 128 bits of Z<dst>. A write to V<dst> also clears all bits of
2962 // Z<dst> above 128, so this `movi` instruction effectively zeroes the
2963 // entire Z<dst> register. According to the Arm Software Optimization
2964 // Guide, `movi` is zero latency.
2965 movi(dst, T2D, 0);
2966 isMerge = true;
2967 }
2968 Assembler::sve_cpy(dst, T, pg, imm8, isMerge);
2969 }
2970
2971 int C2_MacroAssembler::vector_iota_entry_index(BasicType bt) {
2972 // The vector iota entries array is ordered by type B/S/I/L/F/D, and
2973 // the offset between two types is 16.
2974 switch(bt) {
2975 case T_BYTE:
2976 return 0;
2977 case T_SHORT:
2978 return 1;
2979 case T_INT:
2980 return 2;
2981 case T_LONG:
2982 return 3;
2983 case T_FLOAT:
2984 return 4;
2985 case T_DOUBLE:
2986 return 5;
2987 default:
2988 ShouldNotReachHere();
2989 }
2990 }