1 /*
2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "opto/c2_MacroAssembler.hpp"
29 #include "opto/compile.hpp"
30 #include "opto/intrinsicnode.hpp"
31 #include "opto/matcher.hpp"
32 #include "opto/output.hpp"
33 #include "opto/subnode.hpp"
34 #include "runtime/stubRoutines.hpp"
35 #include "utilities/globalDefinitions.hpp"
36
37 #ifdef PRODUCT
38 #define BLOCK_COMMENT(str) /* nothing */
39 #define STOP(error) stop(error)
40 #else
41 #define BLOCK_COMMENT(str) block_comment(str)
42 #define STOP(error) block_comment(error); stop(error)
43 #endif
44
45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
46
47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
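// chr_insn is a pointer-to-member-function type used by the string intrinsics
// below to pick the per-character load at code-generation time (ldrb for Latin1
// data, ldrh for UTF-16). Roughly how it is used (illustrative sketch only):
//   chr_insn load_1chr = isL ? (chr_insn)&MacroAssembler::ldrb
//                            : (chr_insn)&MacroAssembler::ldrh;
//   (this->*load_1chr)(dst, addr);   // emits either an ldrb or an ldrh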
48
49 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
50 Register tmp2Reg, Register tmp3Reg) {
51 Register oop = objectReg;
52 Register box = boxReg;
53 Register disp_hdr = tmpReg;
54 Register tmp = tmp2Reg;
55 Label cont;
56 Label object_has_monitor;
57 Label count, no_count;
58
59 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
60 assert_different_registers(oop, box, tmp, disp_hdr);
61
62 // Load markWord from object into displaced_header.
63 ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
64
65 if (DiagnoseSyncOnValueBasedClasses != 0) {
66 load_klass(tmp, oop);
67 ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
68 tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
69 br(Assembler::NE, cont);
70 }
71
72 // Check for existing monitor
73 tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
74
75 if (LockingMode == LM_MONITOR) {
76 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
77 b(cont);
78 } else {
79 assert(LockingMode == LM_LEGACY, "must be");
80 // Set tmp to be (markWord of object | UNLOCK_VALUE).
81 orr(tmp, disp_hdr, markWord::unlocked_value);
82
83 // Initialize the box. (Must happen before we update the object mark!)
84 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
85
86 // Compare object markWord with an unlocked value (tmp) and if
87 // equal exchange the stack address of our box with object markWord.
88 // On failure disp_hdr contains the possibly locked markWord.
89 cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
90 /*release*/ true, /*weak*/ false, disp_hdr);
91 br(Assembler::EQ, cont);
92
93 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
94
95 // If the compare-and-exchange succeeded, then we found an unlocked
96     // object, have now locked it, and will continue at label cont.
97
98 // Check if the owner is self by comparing the value in the
99 // markWord of object (disp_hdr) with the stack pointer.
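    // In effect (illustrative C, not the generated code):
    //   uintptr_t delta = mark - sp;
    //   bool recursive = (delta & (~(page_size - 1) | lock_mask_in_place)) == 0;
    // i.e. the markWord holds a stack address within one page above our own sp and
    // its lock bits are clear, so the object is already stack-locked by this thread.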
100 mov(rscratch1, sp);
101 sub(disp_hdr, disp_hdr, rscratch1);
102 mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
103     // If the result is zero (flags == EQ) we will succeed at label cont, so we can
104     // store 0 as the displaced header in the box, which indicates a recursive lock.
105 ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
106 str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
107 b(cont);
108 }
109
110 // Handle existing monitor.
111 bind(object_has_monitor);
112
113 // The object's monitor m is unlocked iff m->owner == NULL,
114 // otherwise m->owner may contain a thread or a stack address.
115 //
116 // Try to CAS m->owner from NULL to current thread.
117 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
118 cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
119 /*release*/ true, /*weak*/ false, rscratch1); // Sets flags for result
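  // Note: disp_hdr still holds the markWord, which for an inflated lock is the
  // ObjectMonitor* with markWord::monitor_value set in its low bits, so the add
  // above both strips the tag and forms the address of the owner field in one
  // step (the same idea as the lea in the lightweight-locking path below).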
120
121 // Store a non-null value into the box to avoid looking like a re-entrant
122 // lock. The fast-path monitor unlock code checks for
123 // markWord::monitor_value so use markWord::unused_mark which has the
124 // relevant bit set, and also matches ObjectSynchronizer::enter.
125 mov(tmp, (address)markWord::unused_mark().value());
126 str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
127
128 br(Assembler::EQ, cont); // CAS success means locking succeeded
129
130 cmp(rscratch1, rthread);
131 br(Assembler::NE, cont); // Check for recursive locking
132
133 // Recursive lock case
134 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
135 // flag == EQ still from the cmp above, checking if this is a reentrant lock
136
137 bind(cont);
138 // flag == EQ indicates success
139 // flag == NE indicates failure
140 br(Assembler::NE, no_count);
141
142 bind(count);
143 increment(Address(rthread, JavaThread::held_monitor_count_offset()));
144
145 bind(no_count);
146 }
147
148 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
149 Register tmp2Reg) {
150 Register oop = objectReg;
151 Register box = boxReg;
152 Register disp_hdr = tmpReg;
153 Register tmp = tmp2Reg;
154 Label cont;
155 Label object_has_monitor;
156 Label count, no_count;
157
158 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
159 assert_different_registers(oop, box, tmp, disp_hdr);
160
161 if (LockingMode == LM_LEGACY) {
162 // Find the lock address and load the displaced header from the stack.
163 ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
164
165 // If the displaced header is 0, we have a recursive unlock.
166 cmp(disp_hdr, zr);
167 br(Assembler::EQ, cont);
168 }
169
170 // Handle existing monitor.
171 ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
172 tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
173
174 if (LockingMode == LM_MONITOR) {
175 tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
176 b(cont);
177 } else {
178 assert(LockingMode == LM_LEGACY, "must be");
179     // Check if it is still a lightweight lock, which is true if we
180 // see the stack address of the basicLock in the markWord of the
181 // object.
182
183 cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
184 /*release*/ true, /*weak*/ false, tmp);
185 b(cont);
186 }
187
188 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
189
190 // Handle existing monitor.
191 bind(object_has_monitor);
192 STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
193 add(tmp, tmp, -(int)markWord::monitor_value); // monitor
194
195 ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
196
197 Label notRecursive;
198 cbz(disp_hdr, notRecursive);
199
200 // Recursive lock
201 sub(disp_hdr, disp_hdr, 1u);
202 str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
203 cmp(disp_hdr, disp_hdr); // Sets flags for result
204 b(cont);
205
206 bind(notRecursive);
207 ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
208 ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
209 orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
210 cmp(rscratch1, zr); // Sets flags for result
211 cbnz(rscratch1, cont);
212 // need a release store here
213 lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
214 stlr(zr, tmp); // set unowned
215
216 bind(cont);
217 // flag == EQ indicates success
218 // flag == NE indicates failure
219 br(Assembler::NE, no_count);
220
221 bind(count);
222 decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
223
224 bind(no_count);
225 }
226
227 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1,
228 Register t2, Register t3) {
229 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
230 assert_different_registers(obj, t1, t2, t3);
231
232 // Handle inflated monitor.
233 Label inflated;
234   // Finish fast lock successfully. MUST branch to this label with flag == EQ
235 Label locked;
236   // Finish fast lock unsuccessfully. MUST branch to this label with flag == NE
237 Label slow_path;
238
239 if (DiagnoseSyncOnValueBasedClasses != 0) {
240 load_klass(t1, obj);
241 ldrw(t1, Address(t1, Klass::access_flags_offset()));
242 tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
243 br(Assembler::NE, slow_path);
244 }
245
246 const Register t1_mark = t1;
247
248 { // Lightweight locking
249
250     // Push lock to the lock stack and finish successfully. MUST branch to this label with flag == EQ
251 Label push;
252
253 const Register t2_top = t2;
254 const Register t3_t = t3;
255
256 // Check if lock-stack is full.
257 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
258 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
259 br(Assembler::GT, slow_path);
260
261 // Check if recursive.
262 subw(t3_t, t2_top, oopSize);
263 ldr(t3_t, Address(rthread, t3_t));
264 cmp(obj, t3_t);
265 br(Assembler::EQ, push);
266
267 // Relaxed normal load to check for monitor. Optimization for monitor case.
268 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
269 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
270
271 // Not inflated
272 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
273
274 // Try to lock. Transition lock-bits 0b01 => 0b00
275 orr(t1_mark, t1_mark, markWord::unlocked_value);
276 eor(t3_t, t1_mark, markWord::unlocked_value);
277 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
278 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
279 br(Assembler::NE, slow_path);
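    // The orr/eor above build the cmpxchg operands from the loaded mark (an
    // illustrative note): expected = mark | unlocked_value forces the lock bits
    // to 0b01 (unlocked), and new = expected ^ unlocked_value clears them to
    // 0b00 (fast-locked), so the CAS only succeeds if the object was unlocked.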
280
281 bind(push);
282 // After successful lock, push object on lock-stack.
283 str(obj, Address(rthread, t2_top));
284 addw(t2_top, t2_top, oopSize);
285 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
286 b(locked);
287 }
288
289 { // Handle inflated monitor.
290 bind(inflated);
291
292 // mark contains the tagged ObjectMonitor*.
293 const Register t1_tagged_monitor = t1_mark;
294 const uintptr_t monitor_tag = markWord::monitor_value;
295 const Register t2_owner_addr = t2;
296 const Register t3_owner = t3;
297
298 // Compute owner address.
299 lea(t2_owner_addr, Address(t1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));
300
301 // CAS owner (null => current thread).
302 cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
303 /*release*/ false, /*weak*/ false, t3_owner);
304 br(Assembler::EQ, locked);
305
306 // Check if recursive.
307 cmp(t3_owner, rthread);
308 br(Assembler::NE, slow_path);
309
310 // Recursive.
311 increment(Address(t1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1);
312 }
313
314 bind(locked);
315 increment(Address(rthread, JavaThread::held_monitor_count_offset()));
316
317 #ifdef ASSERT
318 // Check that locked label is reached with Flags == EQ.
319 Label flag_correct;
320 br(Assembler::EQ, flag_correct);
321 stop("Fast Lock Flag != EQ");
322 #endif
323
324 bind(slow_path);
325 #ifdef ASSERT
326 // Check that slow_path label is reached with Flags == NE.
327 br(Assembler::NE, flag_correct);
328 stop("Fast Lock Flag != NE");
329 bind(flag_correct);
330 #endif
331 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
332 }
333
334 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2,
335 Register t3) {
336 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
337 assert_different_registers(obj, t1, t2, t3);
338
339 // Handle inflated monitor.
340 Label inflated, inflated_load_monitor;
341   // Finish fast unlock successfully. MUST branch to this label with flag == EQ
342 Label unlocked;
343   // Finish fast unlock unsuccessfully. MUST branch to this label with flag == NE
344 Label slow_path;
345
346 const Register t1_mark = t1;
347 const Register t2_top = t2;
348 const Register t3_t = t3;
349
350 { // Lightweight unlock
351
352 // Check if obj is top of lock-stack.
353 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
354 subw(t2_top, t2_top, oopSize);
355 ldr(t3_t, Address(rthread, t2_top));
356 cmp(obj, t3_t);
357 // Top of lock stack was not obj. Must be monitor.
358 br(Assembler::NE, inflated_load_monitor);
359
360 // Pop lock-stack.
361 DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
362 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
363
364 // Check if recursive.
365 subw(t3_t, t2_top, oopSize);
366 ldr(t3_t, Address(rthread, t3_t));
367 cmp(obj, t3_t);
368 br(Assembler::EQ, unlocked);
369
370 // Not recursive.
371 // Load Mark.
372 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
373
374 // Check header for monitor (0b10).
375 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
376
377 // Try to unlock. Transition lock bits 0b00 => 0b01
378 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
379 orr(t3_t, t1_mark, markWord::unlocked_value);
380 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
381 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
382 br(Assembler::EQ, unlocked);
383
384 // Compare and exchange failed.
385 // Restore lock-stack and handle the unlock in runtime.
386 DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
387 addw(t2_top, t2_top, oopSize);
388 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
389 b(slow_path);
390 }
391
392
393 { // Handle inflated monitor.
394 bind(inflated_load_monitor);
395 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
396 #ifdef ASSERT
397 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
398 stop("Fast Unlock not monitor");
399 #endif
400
401 bind(inflated);
402
403 #ifdef ASSERT
404 Label check_done;
405 subw(t2_top, t2_top, oopSize);
406 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
407 br(Assembler::LT, check_done);
408 ldr(t3_t, Address(rthread, t2_top));
409 cmp(obj, t3_t);
410 br(Assembler::NE, inflated);
411 stop("Fast Unlock lock on stack");
412 bind(check_done);
413 #endif
414
415 // mark contains the tagged ObjectMonitor*.
416 const Register t1_monitor = t1_mark;
417 const uintptr_t monitor_tag = markWord::monitor_value;
418
419 // Untag the monitor.
420 sub(t1_monitor, t1_mark, monitor_tag);
421
422 const Register t2_recursions = t2;
423 Label not_recursive;
424
425 // Check if recursive.
426 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
427 cbz(t2_recursions, not_recursive);
428
429 // Recursive unlock.
430 sub(t2_recursions, t2_recursions, 1u);
431 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
432 // Set flag == EQ
433 cmp(t2_recursions, t2_recursions);
434 b(unlocked);
435
436 bind(not_recursive);
437
438 Label release;
439 const Register t2_owner_addr = t2;
440
441 // Compute owner address.
442 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
443
444 // Check if the entry lists are empty.
445 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
446 ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
447 orr(rscratch1, rscratch1, t3_t);
448 cmp(rscratch1, zr);
449 br(Assembler::EQ, release);
450
451 // The owner may be anonymous and we removed the last obj entry in
452 // the lock-stack. This loses the information about the owner.
453 // Write the thread to the owner field so the runtime knows the owner.
454 str(rthread, Address(t2_owner_addr));
455 b(slow_path);
456
457 bind(release);
458 // Set owner to null.
459 // Release to satisfy the JMM
460 stlr(zr, t2_owner_addr);
461 }
462
463 bind(unlocked);
464 decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
465
466 #ifdef ASSERT
467 // Check that unlocked label is reached with Flags == EQ.
468 Label flag_correct;
469 br(Assembler::EQ, flag_correct);
470 stop("Fast Unlock Flag != EQ");
471 #endif
472
473 bind(slow_path);
474 #ifdef ASSERT
475 // Check that slow_path label is reached with Flags == NE.
476 br(Assembler::NE, flag_correct);
477 stop("Fast Unlock Flag != NE");
478 bind(flag_correct);
479 #endif
480 // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
481 }
482
483 // Search for str1 in str2 and return index or -1
484 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
485 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
486 Register cnt2, Register cnt1,
487 Register tmp1, Register tmp2,
488 Register tmp3, Register tmp4,
489 Register tmp5, Register tmp6,
490 int icnt1, Register result, int ae) {
491 // NOTE: tmp5, tmp6 can be zr depending on specific method version
492 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
493
494 Register ch1 = rscratch1;
495 Register ch2 = rscratch2;
496 Register cnt1tmp = tmp1;
497 Register cnt2tmp = tmp2;
498 Register cnt1_neg = cnt1;
499 Register cnt2_neg = cnt2;
500 Register result_tmp = tmp4;
501
502 bool isL = ae == StrIntrinsicNode::LL;
503
504 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
505 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
506 int str1_chr_shift = str1_isL ? 0:1;
507 int str2_chr_shift = str2_isL ? 0:1;
508 int str1_chr_size = str1_isL ? 1:2;
509 int str2_chr_size = str2_isL ? 1:2;
510 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
511 (chr_insn)&MacroAssembler::ldrh;
512 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
513 (chr_insn)&MacroAssembler::ldrh;
514 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
515 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
516
517 // Note, inline_string_indexOf() generates checks:
518 // if (substr.count > string.count) return -1;
519 // if (substr.count == 0) return 0;
520
521 // We have two strings, a source string in str2, cnt2 and a pattern string
522 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
523
524 // For larger pattern and source we use a simplified Boyer Moore algorithm.
525 // With a small pattern and source we use linear scan.
526
527 if (icnt1 == -1) {
528 sub(result_tmp, cnt2, cnt1);
529 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
530 br(LT, LINEARSEARCH);
531 dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
532 subs(zr, cnt1, 256);
533 lsr(tmp1, cnt2, 2);
534 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
535 br(GE, LINEARSTUB);
536 }
537
538 // The Boyer-Moore algorithm is based on the description here:-
539 //
540 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
541 //
542 // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
543 // and the 'Good Suffix' rule.
544 //
545 // These rules are essentially heuristics for how far we can shift the
546 // pattern along the search string.
547 //
548 // The implementation here uses the 'Bad Character' rule only because of the
549 // complexity of initialisation for the 'Good Suffix' rule.
550 //
551 // This is also known as the Boyer-Moore-Horspool algorithm:-
552 //
553 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
554 //
555 // This particular implementation has a few Java-specific optimizations.
556 //
557 // #define ASIZE 256
558 //
559 // int bm(unsigned char *x, int m, unsigned char *y, int n) {
560 // int i, j;
561 // unsigned c;
562 // unsigned char bc[ASIZE];
563 //
564 // /* Preprocessing */
565 // for (i = 0; i < ASIZE; ++i)
566 // bc[i] = m;
567 // for (i = 0; i < m - 1; ) {
568 // c = x[i];
569 // ++i;
570 // // c < 256 for Latin1 string, so, no need for branch
571 // #ifdef PATTERN_STRING_IS_LATIN1
572 // bc[c] = m - i;
573 // #else
574 // if (c < ASIZE) bc[c] = m - i;
575 // #endif
576 // }
577 //
578 // /* Searching */
579 // j = 0;
580 // while (j <= n - m) {
581 //     c = y[j+m-1];
582 // if (x[m-1] == c)
583 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
584 // if (i < 0) return j;
585 // // c < 256 for Latin1 string, so, no need for branch
586 // #ifdef SOURCE_STRING_IS_LATIN1
587 // // LL case: (c< 256) always true. Remove branch
588 // j += bc[y[j+m-1]];
589 // #endif
590 // #ifndef PATTERN_STRING_IS_UTF
591 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
592 // if (c < ASIZE)
593 // j += bc[y[j+m-1]];
594 // else
595 // j += 1
596 // #endif
597 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
598 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
599 // if (c < ASIZE)
600 // j += bc[y[j+m-1]];
601 // else
602 // j += m
603 // #endif
604 // }
605 // }
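//
// A small worked example of the bad-character table built above (illustrative
// only): for the pattern x = "NEEDLE" (m = 6) the preprocessing loop yields
//   bc['N'] = 5, bc['E'] = 3 (the last of the first m-1 occurrences wins),
//   bc['D'] = 2, bc['L'] = 1, and bc[c] = 6 for every other character,
// so when the text character aligned with the last pattern position does not
// occur in the pattern at all, the pattern is shifted by its full length.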
606
607 if (icnt1 == -1) {
608 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
609 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
610 Register cnt1end = tmp2;
611 Register str2end = cnt2;
612 Register skipch = tmp2;
613
614     // str1 length is >= 8, so we can read at least 1 register for the cases where
615     // UTF->Latin1 conversion is not needed (8 LL or 4 UU chars) and half a register
616     // for the UL case. We re-read the last character in the inner pre-loop code to
617     // keep a single outer pre-loop load.
618 const int firstStep = isL ? 7 : 3;
619
620 const int ASIZE = 256;
621 const int STORED_BYTES = 32; // amount of bytes stored per instruction
622 sub(sp, sp, ASIZE);
623 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
624 mov(ch1, sp);
625 BIND(BM_INIT_LOOP);
626 stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
627 subs(tmp5, tmp5, 1);
628 br(GT, BM_INIT_LOOP);
629
630 sub(cnt1tmp, cnt1, 1);
631 mov(tmp5, str2);
632 add(str2end, str2, result_tmp, LSL, str2_chr_shift);
633 sub(ch2, cnt1, 1);
634 mov(tmp3, str1);
635 BIND(BCLOOP);
636 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
637 if (!str1_isL) {
638 subs(zr, ch1, ASIZE);
639 br(HS, BCSKIP);
640 }
641 strb(ch2, Address(sp, ch1));
642 BIND(BCSKIP);
643 subs(ch2, ch2, 1);
644 br(GT, BCLOOP);
645
646 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
647 if (str1_isL == str2_isL) {
648 // load last 8 bytes (8LL/4UU symbols)
649 ldr(tmp6, Address(tmp6, -wordSize));
650 } else {
651 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
652 // convert Latin1 to UTF. We'll have to wait until load completed, but
653 // it's still faster than per-character loads+checks
654 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
655 ubfx(ch1, tmp6, 8, 8); // str1[N-2]
656 ubfx(ch2, tmp6, 16, 8); // str1[N-3]
657 andr(tmp6, tmp6, 0xFF); // str1[N-4]
658 orr(ch2, ch1, ch2, LSL, 16);
659 orr(tmp6, tmp6, tmp3, LSL, 48);
660 orr(tmp6, tmp6, ch2, LSL, 16);
661 }
662 BIND(BMLOOPSTR2);
663 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
664 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
665 if (str1_isL == str2_isL) {
666 // re-init tmp3. It's for free because it's executed in parallel with
667 // load above. Alternative is to initialize it before loop, but it'll
668 // affect performance on in-order systems with 2 or more ld/st pipelines
669 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
670 }
671 if (!isL) { // UU/UL case
672 lsl(ch2, cnt1tmp, 1); // offset in bytes
673 }
674 cmp(tmp3, skipch);
675 br(NE, BMSKIP);
676 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
677 mov(ch1, tmp6);
678 if (isL) {
679 b(BMLOOPSTR1_AFTER_LOAD);
680 } else {
681 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
682 b(BMLOOPSTR1_CMP);
683 }
684 BIND(BMLOOPSTR1);
685 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
686 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
687 BIND(BMLOOPSTR1_AFTER_LOAD);
688 subs(cnt1tmp, cnt1tmp, 1);
689 br(LT, BMLOOPSTR1_LASTCMP);
690 BIND(BMLOOPSTR1_CMP);
691 cmp(ch1, ch2);
692 br(EQ, BMLOOPSTR1);
693 BIND(BMSKIP);
694 if (!isL) {
695       // if we've hit a UTF symbol while searching a Latin1 pattern, then we can
696       // skip cnt1 symbols
697 if (str1_isL != str2_isL) {
698 mov(result_tmp, cnt1);
699 } else {
700 mov(result_tmp, 1);
701 }
702 subs(zr, skipch, ASIZE);
703 br(HS, BMADV);
704 }
705 ldrb(result_tmp, Address(sp, skipch)); // load skip distance
706 BIND(BMADV);
707 sub(cnt1tmp, cnt1, 1);
708 add(str2, str2, result_tmp, LSL, str2_chr_shift);
709 cmp(str2, str2end);
710 br(LE, BMLOOPSTR2);
711 add(sp, sp, ASIZE);
712 b(NOMATCH);
713 BIND(BMLOOPSTR1_LASTCMP);
714 cmp(ch1, ch2);
715 br(NE, BMSKIP);
716 BIND(BMMATCH);
717 sub(result, str2, tmp5);
718 if (!str2_isL) lsr(result, result, 1);
719 add(sp, sp, ASIZE);
720 b(DONE);
721
722 BIND(LINEARSTUB);
723     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
724 br(LT, LINEAR_MEDIUM);
725 mov(result, zr);
726 RuntimeAddress stub = nullptr;
727 if (isL) {
728 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
729 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
730 } else if (str1_isL) {
731 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
732 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
733 } else {
734 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
735 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
736 }
737 address call = trampoline_call(stub);
738 if (call == nullptr) {
739 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
740 ciEnv::current()->record_failure("CodeCache is full");
741 return;
742 }
743 b(DONE);
744 }
745
746 BIND(LINEARSEARCH);
747 {
748 Label DO1, DO2, DO3;
749
750 Register str2tmp = tmp2;
751 Register first = tmp3;
752
753 if (icnt1 == -1)
754 {
755 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
756
757 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
758 br(LT, DOSHORT);
759 BIND(LINEAR_MEDIUM);
760 (this->*str1_load_1chr)(first, Address(str1));
761 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
762 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
763 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
764 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
765
766 BIND(FIRST_LOOP);
767 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
768 cmp(first, ch2);
769 br(EQ, STR1_LOOP);
770 BIND(STR2_NEXT);
771 adds(cnt2_neg, cnt2_neg, str2_chr_size);
772 br(LE, FIRST_LOOP);
773 b(NOMATCH);
774
775 BIND(STR1_LOOP);
776 adds(cnt1tmp, cnt1_neg, str1_chr_size);
777 add(cnt2tmp, cnt2_neg, str2_chr_size);
778 br(GE, MATCH);
779
780 BIND(STR1_NEXT);
781 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
782 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
783 cmp(ch1, ch2);
784 br(NE, STR2_NEXT);
785 adds(cnt1tmp, cnt1tmp, str1_chr_size);
786 add(cnt2tmp, cnt2tmp, str2_chr_size);
787 br(LT, STR1_NEXT);
788 b(MATCH);
789
790 BIND(DOSHORT);
791 if (str1_isL == str2_isL) {
792 cmp(cnt1, (u1)2);
793 br(LT, DO1);
794 br(GT, DO3);
795 }
796 }
797
798 if (icnt1 == 4) {
799 Label CH1_LOOP;
800
801 (this->*load_4chr)(ch1, str1);
802 sub(result_tmp, cnt2, 4);
803 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
804 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
805
806 BIND(CH1_LOOP);
807 (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
808 cmp(ch1, ch2);
809 br(EQ, MATCH);
810 adds(cnt2_neg, cnt2_neg, str2_chr_size);
811 br(LE, CH1_LOOP);
812 b(NOMATCH);
813 }
814
815 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
816 Label CH1_LOOP;
817
818 BIND(DO2);
819 (this->*load_2chr)(ch1, str1);
820 if (icnt1 == 2) {
821 sub(result_tmp, cnt2, 2);
822 }
823 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
824 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
825 BIND(CH1_LOOP);
826 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
827 cmp(ch1, ch2);
828 br(EQ, MATCH);
829 adds(cnt2_neg, cnt2_neg, str2_chr_size);
830 br(LE, CH1_LOOP);
831 b(NOMATCH);
832 }
833
834 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
835 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
836
837 BIND(DO3);
838 (this->*load_2chr)(first, str1);
839 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
840 if (icnt1 == 3) {
841 sub(result_tmp, cnt2, 3);
842 }
843 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
844 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
845 BIND(FIRST_LOOP);
846 (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
847 cmpw(first, ch2);
848 br(EQ, STR1_LOOP);
849 BIND(STR2_NEXT);
850 adds(cnt2_neg, cnt2_neg, str2_chr_size);
851 br(LE, FIRST_LOOP);
852 b(NOMATCH);
853
854 BIND(STR1_LOOP);
855 add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
856 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
857 cmp(ch1, ch2);
858 br(NE, STR2_NEXT);
859 b(MATCH);
860 }
861
862 if (icnt1 == -1 || icnt1 == 1) {
863 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
864
865 BIND(DO1);
866 (this->*str1_load_1chr)(ch1, str1);
867 cmp(cnt2, (u1)8);
868 br(LT, DO1_SHORT);
869
870 sub(result_tmp, cnt2, 8/str2_chr_size);
871 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
872 mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
873 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
874
875 if (str2_isL) {
876 orr(ch1, ch1, ch1, LSL, 8);
877 }
878 orr(ch1, ch1, ch1, LSL, 16);
879 orr(ch1, ch1, ch1, LSL, 32);
880 BIND(CH1_LOOP);
881 ldr(ch2, Address(str2, cnt2_neg));
882 eor(ch2, ch1, ch2);
883 sub(tmp1, ch2, tmp3);
884 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
885 bics(tmp1, tmp1, tmp2);
886 br(NE, HAS_ZERO);
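      // The sub/orr/bics sequence above is the classic SWAR zero-byte test applied
      // to ch2 = word ^ broadcast(ch1): tmp1 & ~tmp2 equals
      // (v - 0x0101..01) & ~v & 0x8080..80 (or the 0x0001/0x7fff/0x8000 halfword
      // variant for UTF-16), which is non-zero (NE) exactly when some byte/char of
      // v is zero, i.e. the searched character occurs somewhere in the loaded word.
      // string_indexof_char() and stringL_indexof_char() below use the same trick.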
887 adds(cnt2_neg, cnt2_neg, 8);
888 br(LT, CH1_LOOP);
889
890 cmp(cnt2_neg, (u1)8);
891 mov(cnt2_neg, 0);
892 br(LT, CH1_LOOP);
893 b(NOMATCH);
894
895 BIND(HAS_ZERO);
896 rev(tmp1, tmp1);
897 clz(tmp1, tmp1);
898 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
899 b(MATCH);
900
901 BIND(DO1_SHORT);
902 mov(result_tmp, cnt2);
903 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
904 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
905 BIND(DO1_LOOP);
906 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
907 cmpw(ch1, ch2);
908 br(EQ, MATCH);
909 adds(cnt2_neg, cnt2_neg, str2_chr_size);
910 br(LT, DO1_LOOP);
911 }
912 }
913 BIND(NOMATCH);
914 mov(result, -1);
915 b(DONE);
916 BIND(MATCH);
917 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
918 BIND(DONE);
919 }
920
921 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
922 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
923
924 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
925 Register ch, Register result,
926 Register tmp1, Register tmp2, Register tmp3)
927 {
928 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
929 Register cnt1_neg = cnt1;
930 Register ch1 = rscratch1;
931 Register result_tmp = rscratch2;
932
933 cbz(cnt1, NOMATCH);
934
935 cmp(cnt1, (u1)4);
936 br(LT, DO1_SHORT);
937
938 orr(ch, ch, ch, LSL, 16);
939 orr(ch, ch, ch, LSL, 32);
940
941 sub(cnt1, cnt1, 4);
942 mov(result_tmp, cnt1);
943 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
944 sub(cnt1_neg, zr, cnt1, LSL, 1);
945
946 mov(tmp3, 0x0001000100010001);
947
948 BIND(CH1_LOOP);
949 ldr(ch1, Address(str1, cnt1_neg));
950 eor(ch1, ch, ch1);
951 sub(tmp1, ch1, tmp3);
952 orr(tmp2, ch1, 0x7fff7fff7fff7fff);
953 bics(tmp1, tmp1, tmp2);
954 br(NE, HAS_ZERO);
955 adds(cnt1_neg, cnt1_neg, 8);
956 br(LT, CH1_LOOP);
957
958 cmp(cnt1_neg, (u1)8);
959 mov(cnt1_neg, 0);
960 br(LT, CH1_LOOP);
961 b(NOMATCH);
962
963 BIND(HAS_ZERO);
964 rev(tmp1, tmp1);
965 clz(tmp1, tmp1);
966 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
967 b(MATCH);
968
969 BIND(DO1_SHORT);
970 mov(result_tmp, cnt1);
971 lea(str1, Address(str1, cnt1, Address::uxtw(1)));
972 sub(cnt1_neg, zr, cnt1, LSL, 1);
973 BIND(DO1_LOOP);
974 ldrh(ch1, Address(str1, cnt1_neg));
975 cmpw(ch, ch1);
976 br(EQ, MATCH);
977 adds(cnt1_neg, cnt1_neg, 2);
978 br(LT, DO1_LOOP);
979 BIND(NOMATCH);
980 mov(result, -1);
981 b(DONE);
982 BIND(MATCH);
983 add(result, result_tmp, cnt1_neg, ASR, 1);
984 BIND(DONE);
985 }
986
987 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
988 Register ch, Register result,
989 FloatRegister ztmp1,
990 FloatRegister ztmp2,
991 PRegister tmp_pg,
992 PRegister tmp_pdn, bool isL)
993 {
994 // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
995 assert(tmp_pg->is_governing(),
996 "this register has to be a governing predicate register");
997
998 Label LOOP, MATCH, DONE, NOMATCH;
999 Register vec_len = rscratch1;
1000 Register idx = rscratch2;
1001
1002   SIMD_RegVariant T = isL ? B : H;
1003
1004 cbz(cnt1, NOMATCH);
1005
1006   // Broadcast the char being searched for to every element of the vector.
1007 sve_dup(ztmp2, T, ch);
1008 if (isL) {
1009 sve_cntb(vec_len);
1010 } else {
1011 sve_cnth(vec_len);
1012 }
1013 mov(idx, 0);
1014
1015 // Generate a predicate to control the reading of input string.
1016 sve_whilelt(tmp_pg, T, idx, cnt1);
1017
1018 BIND(LOOP);
1019 // Read a vector of 8- or 16-bit data depending on the string type. Note
1020 // that inactive elements indicated by the predicate register won't cause
1021 // a data read from memory to the destination vector.
1022 if (isL) {
1023 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1024 } else {
1025 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1026 }
1027 add(idx, idx, vec_len);
1028
1029 // Perform the comparison. An element of the destination predicate is set
1030 // to active if the particular char is matched.
1031 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1032
1033 // Branch if the particular char is found.
1034 br(NE, MATCH);
1035
1036 sve_whilelt(tmp_pg, T, idx, cnt1);
1037
1038   // Loop back if the particular char has not been found yet.
1039 br(MI, LOOP);
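  // Note on the condition codes used with SVE here: after sve_cmp, NE means
  // "at least one element matched" (the ANY alias), and after sve_whilelt, MI
  // means "the first element of the new predicate is active" (the FIRST alias),
  // i.e. idx is still below cnt1 and another iteration is needed.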
1040
1041 BIND(NOMATCH);
1042 mov(result, -1);
1043 b(DONE);
1044
1045 BIND(MATCH);
1046 // Undo the index increment.
1047 sub(idx, idx, vec_len);
1048
1049 // Crop the vector to find its location.
1050 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1051 add(result, idx, -1);
1052 sve_incp(result, T, tmp_pdn);
1053 BIND(DONE);
1054 }
1055
1056 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1057 Register ch, Register result,
1058 Register tmp1, Register tmp2, Register tmp3)
1059 {
1060 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1061 Register cnt1_neg = cnt1;
1062 Register ch1 = rscratch1;
1063 Register result_tmp = rscratch2;
1064
1065 cbz(cnt1, NOMATCH);
1066
1067 cmp(cnt1, (u1)8);
1068 br(LT, DO1_SHORT);
1069
1070 orr(ch, ch, ch, LSL, 8);
1071 orr(ch, ch, ch, LSL, 16);
1072 orr(ch, ch, ch, LSL, 32);
1073
1074 sub(cnt1, cnt1, 8);
1075 mov(result_tmp, cnt1);
1076 lea(str1, Address(str1, cnt1));
1077 sub(cnt1_neg, zr, cnt1);
1078
1079 mov(tmp3, 0x0101010101010101);
1080
1081 BIND(CH1_LOOP);
1082 ldr(ch1, Address(str1, cnt1_neg));
1083 eor(ch1, ch, ch1);
1084 sub(tmp1, ch1, tmp3);
1085 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1086 bics(tmp1, tmp1, tmp2);
1087 br(NE, HAS_ZERO);
1088 adds(cnt1_neg, cnt1_neg, 8);
1089 br(LT, CH1_LOOP);
1090
1091 cmp(cnt1_neg, (u1)8);
1092 mov(cnt1_neg, 0);
1093 br(LT, CH1_LOOP);
1094 b(NOMATCH);
1095
1096 BIND(HAS_ZERO);
1097 rev(tmp1, tmp1);
1098 clz(tmp1, tmp1);
1099 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1100 b(MATCH);
1101
1102 BIND(DO1_SHORT);
1103 mov(result_tmp, cnt1);
1104 lea(str1, Address(str1, cnt1));
1105 sub(cnt1_neg, zr, cnt1);
1106 BIND(DO1_LOOP);
1107 ldrb(ch1, Address(str1, cnt1_neg));
1108 cmp(ch, ch1);
1109 br(EQ, MATCH);
1110 adds(cnt1_neg, cnt1_neg, 1);
1111 br(LT, DO1_LOOP);
1112 BIND(NOMATCH);
1113 mov(result, -1);
1114 b(DONE);
1115 BIND(MATCH);
1116 add(result, result_tmp, cnt1_neg);
1117 BIND(DONE);
1118 }
1119
1120 // Compare strings.
1121 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1122 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1123 FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1124 PRegister pgtmp1, PRegister pgtmp2, int ae) {
1125 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1126 DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1127 SHORT_LOOP_START, TAIL_CHECK;
1128
1129 bool isLL = ae == StrIntrinsicNode::LL;
1130 bool isLU = ae == StrIntrinsicNode::LU;
1131 bool isUL = ae == StrIntrinsicNode::UL;
1132
1133 // The stub threshold for LL strings is: 72 (64 + 8) chars
1134 // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1135 // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1136 const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1137
1138 bool str1_isL = isLL || isLU;
1139 bool str2_isL = isLL || isUL;
1140
1141 int str1_chr_shift = str1_isL ? 0 : 1;
1142 int str2_chr_shift = str2_isL ? 0 : 1;
1143 int str1_chr_size = str1_isL ? 1 : 2;
1144 int str2_chr_size = str2_isL ? 1 : 2;
1145 int minCharsInWord = isLL ? wordSize : wordSize/2;
1146
1147 FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1148 chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1149 (chr_insn)&MacroAssembler::ldrh;
1150 chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1151 (chr_insn)&MacroAssembler::ldrh;
1152 uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1153 (uxt_insn)&MacroAssembler::uxthw;
1154
1155 BLOCK_COMMENT("string_compare {");
1156
1157   // Bizarrely, the counts are passed in bytes, regardless of whether they
1158   // are L or U strings; however, the result is always in characters.
1159 if (!str1_isL) asrw(cnt1, cnt1, 1);
1160 if (!str2_isL) asrw(cnt2, cnt2, 1);
1161
1162 // Compute the minimum of the string lengths and save the difference.
1163 subsw(result, cnt1, cnt2);
1164 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1165
1166 // A very short string
1167 cmpw(cnt2, minCharsInWord);
1168 br(Assembler::LE, SHORT_STRING);
1169
1170 // Compare longwords
1171 // load first parts of strings and finish initialization while loading
1172 {
1173 if (str1_isL == str2_isL) { // LL or UU
1174 ldr(tmp1, Address(str1));
1175 cmp(str1, str2);
1176 br(Assembler::EQ, DONE);
1177 ldr(tmp2, Address(str2));
1178 cmp(cnt2, stub_threshold);
1179 br(GE, STUB);
1180 subsw(cnt2, cnt2, minCharsInWord);
1181 br(EQ, TAIL_CHECK);
1182 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1183 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1184 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1185 } else if (isLU) {
1186 ldrs(vtmp, Address(str1));
1187 ldr(tmp2, Address(str2));
1188 cmp(cnt2, stub_threshold);
1189 br(GE, STUB);
1190 subw(cnt2, cnt2, 4);
1191 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1192 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1193 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1194 zip1(vtmp, T8B, vtmp, vtmpZ);
1195 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1196 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1197 add(cnt1, cnt1, 4);
1198 fmovd(tmp1, vtmp);
1199 } else { // UL case
1200 ldr(tmp1, Address(str1));
1201 ldrs(vtmp, Address(str2));
1202 cmp(cnt2, stub_threshold);
1203 br(GE, STUB);
1204 subw(cnt2, cnt2, 4);
1205 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1206 eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1207 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1208 sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1209 zip1(vtmp, T8B, vtmp, vtmpZ);
1210 sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1211 add(cnt1, cnt1, 8);
1212 fmovd(tmp2, vtmp);
1213 }
1214 adds(cnt2, cnt2, isUL ? 4 : 8);
1215 br(GE, TAIL);
1216 eor(rscratch2, tmp1, tmp2);
1217 cbnz(rscratch2, DIFF);
1218 // main loop
1219 bind(NEXT_WORD);
1220 if (str1_isL == str2_isL) {
1221 ldr(tmp1, Address(str1, cnt2));
1222 ldr(tmp2, Address(str2, cnt2));
1223 adds(cnt2, cnt2, 8);
1224 } else if (isLU) {
1225 ldrs(vtmp, Address(str1, cnt1));
1226 ldr(tmp2, Address(str2, cnt2));
1227 add(cnt1, cnt1, 4);
1228 zip1(vtmp, T8B, vtmp, vtmpZ);
1229 fmovd(tmp1, vtmp);
1230 adds(cnt2, cnt2, 8);
1231 } else { // UL
1232 ldrs(vtmp, Address(str2, cnt2));
1233 ldr(tmp1, Address(str1, cnt1));
1234 zip1(vtmp, T8B, vtmp, vtmpZ);
1235 add(cnt1, cnt1, 8);
1236 fmovd(tmp2, vtmp);
1237 adds(cnt2, cnt2, 4);
1238 }
1239 br(GE, TAIL);
1240
1241 eor(rscratch2, tmp1, tmp2);
1242 cbz(rscratch2, NEXT_WORD);
1243 b(DIFF);
1244 bind(TAIL);
1245 eor(rscratch2, tmp1, tmp2);
1246 cbnz(rscratch2, DIFF);
1247 // Last longword. In the case where length == 4 we compare the
1248 // same longword twice, but that's still faster than another
1249 // conditional branch.
1250 if (str1_isL == str2_isL) {
1251 ldr(tmp1, Address(str1));
1252 ldr(tmp2, Address(str2));
1253 } else if (isLU) {
1254 ldrs(vtmp, Address(str1));
1255 ldr(tmp2, Address(str2));
1256 zip1(vtmp, T8B, vtmp, vtmpZ);
1257 fmovd(tmp1, vtmp);
1258 } else { // UL
1259 ldrs(vtmp, Address(str2));
1260 ldr(tmp1, Address(str1));
1261 zip1(vtmp, T8B, vtmp, vtmpZ);
1262 fmovd(tmp2, vtmp);
1263 }
1264 bind(TAIL_CHECK);
1265 eor(rscratch2, tmp1, tmp2);
1266 cbz(rscratch2, DONE);
1267
1268 // Find the first different characters in the longwords and
1269 // compute their difference.
1270 bind(DIFF);
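    // rscratch2 = tmp1 ^ tmp2, so its least significant non-zero byte marks the
    // first mismatching character of the little-endian longwords. rev + clz turn
    // that into a bit offset, andr rounds it down to a character boundary (8 for
    // Latin1, 16 for UTF-16), and lsrv shifts both words so the mismatching
    // characters land in the low bits before they are widened and subtracted.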
1271 rev(rscratch2, rscratch2);
1272 clz(rscratch2, rscratch2);
1273 andr(rscratch2, rscratch2, isLL ? -8 : -16);
1274 lsrv(tmp1, tmp1, rscratch2);
1275 (this->*ext_chr)(tmp1, tmp1);
1276 lsrv(tmp2, tmp2, rscratch2);
1277 (this->*ext_chr)(tmp2, tmp2);
1278 subw(result, tmp1, tmp2);
1279 b(DONE);
1280 }
1281
1282 bind(STUB);
1283 RuntimeAddress stub = nullptr;
1284 switch(ae) {
1285 case StrIntrinsicNode::LL:
1286 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1287 break;
1288 case StrIntrinsicNode::UU:
1289 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1290 break;
1291 case StrIntrinsicNode::LU:
1292 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1293 break;
1294 case StrIntrinsicNode::UL:
1295 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1296 break;
1297 default:
1298 ShouldNotReachHere();
1299 }
1300 assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1301 address call = trampoline_call(stub);
1302 if (call == nullptr) {
1303 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1304 ciEnv::current()->record_failure("CodeCache is full");
1305 return;
1306 }
1307 b(DONE);
1308
1309 bind(SHORT_STRING);
1310 // Is the minimum length zero?
1311 cbz(cnt2, DONE);
1312   // Arrange the code so that most branches happen while loading, and the next
1313   // characters are loaded while the previous ones are being compared.
1314 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1315 subs(cnt2, cnt2, 1);
1316 br(EQ, SHORT_LAST_INIT);
1317 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1318 b(SHORT_LOOP_START);
1319 bind(SHORT_LOOP);
1320 subs(cnt2, cnt2, 1);
1321 br(EQ, SHORT_LAST);
1322 bind(SHORT_LOOP_START);
1323 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1324 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1325 cmp(tmp1, cnt1);
1326 br(NE, SHORT_LOOP_TAIL);
1327 subs(cnt2, cnt2, 1);
1328 br(EQ, SHORT_LAST2);
1329 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1330 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1331 cmp(tmp2, rscratch1);
1332 br(EQ, SHORT_LOOP);
1333 sub(result, tmp2, rscratch1);
1334 b(DONE);
1335 bind(SHORT_LOOP_TAIL);
1336 sub(result, tmp1, cnt1);
1337 b(DONE);
1338 bind(SHORT_LAST2);
1339 cmp(tmp2, rscratch1);
1340 br(EQ, DONE);
1341 sub(result, tmp2, rscratch1);
1342
1343 b(DONE);
1344 bind(SHORT_LAST_INIT);
1345 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1346 bind(SHORT_LAST);
1347 cmp(tmp1, cnt1);
1348 br(EQ, DONE);
1349 sub(result, tmp1, cnt1);
1350
1351 bind(DONE);
1352
1353 BLOCK_COMMENT("} string_compare");
1354 }
1355
1356 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1357 FloatRegister src2, Condition cond, bool isQ) {
1358 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1359 FloatRegister zn = src1, zm = src2;
1360 bool needs_negation = false;
1361 switch (cond) {
1362 case LT: cond = GT; zn = src2; zm = src1; break;
1363 case LE: cond = GE; zn = src2; zm = src1; break;
1364 case LO: cond = HI; zn = src2; zm = src1; break;
1365 case LS: cond = HS; zn = src2; zm = src1; break;
1366 case NE: cond = EQ; needs_negation = true; break;
1367 default:
1368 break;
1369 }
1370
1371 if (is_floating_point_type(bt)) {
1372 fcm(cond, dst, size, zn, zm);
1373 } else {
1374 cm(cond, dst, size, zn, zm);
1375 }
1376
1377 if (needs_negation) {
1378 notr(dst, isQ ? T16B : T8B, dst);
1379 }
1380 }
1381
1382 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1383 Condition cond, bool isQ) {
1384 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1385 if (bt == T_FLOAT || bt == T_DOUBLE) {
1386 if (cond == Assembler::NE) {
1387 fcm(Assembler::EQ, dst, size, src);
1388 notr(dst, isQ ? T16B : T8B, dst);
1389 } else {
1390 fcm(cond, dst, size, src);
1391 }
1392 } else {
1393 if (cond == Assembler::NE) {
1394 cm(Assembler::EQ, dst, size, src);
1395 notr(dst, isQ ? T16B : T8B, dst);
1396 } else {
1397 cm(cond, dst, size, src);
1398 }
1399 }
1400 }
1401
1402 // Compress the least significant bit of each byte into the low-order byte of dst
1403 // and clear the higher garbage bits.
1404 void C2_MacroAssembler::bytemask_compress(Register dst) {
1405 // Example input, dst = 0x01 00 00 00 01 01 00 01
1406 // The "??" bytes are garbage.
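  // Each orr below folds the flag bits of higher bytes down toward bit 0: after
  // the first step each byte accumulates 2 flags, after the second 4, and after
  // the third all 8 flags sit in the lowest byte, which the final andr extracts.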
1407 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1408 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1409 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1410 andr(dst, dst, 0xff); // dst = 0x8D
1411 }
1412
1413 // Pack the lowest-numbered bit of each mask element in src into a long value
1414 // in dst, at most the first 64 lane elements.
1415 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1416 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1417 FloatRegister vtmp1, FloatRegister vtmp2) {
1418 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1419 assert_different_registers(dst, rscratch1);
1420 assert_different_registers(vtmp1, vtmp2);
1421
1422 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1423 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1424 // Expected: dst = 0x658D
1425
1426   // Convert the mask into a vector whose active elements become 0x01 and inactive ones 0x00.
1427 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1428 sve_cpy(vtmp1, size, src, 1, false);
1429 if (bt != T_BYTE) {
1430 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1431 }
1432
1433 if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1434 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1435 // is to compress each significant bit of the byte in a cross-lane way. Due
1436 // to the lack of a cross-lane bit-compress instruction, we use BEXT
1437 // (bit-compress in each lane) with the biggest lane size (T = D) then
1438 // concatenate the results.
1439
1440 // The second source input of BEXT, initialized with 0x01 in each byte.
1441 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1442 sve_dup(vtmp2, B, 1);
1443
1444 // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1445 // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1446 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1447 // ---------------------------------------
1448 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1449 sve_bext(vtmp1, D, vtmp1, vtmp2);
1450
1451     // Concatenate the least significant 8 bits of each 8-byte lane, and extract the
1452 // result to dst.
1453 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1454 // dst = 0x658D
1455 if (lane_cnt <= 8) {
1456 // No need to concatenate.
1457 umov(dst, vtmp1, B, 0);
1458 } else if (lane_cnt <= 16) {
1459 ins(vtmp1, B, vtmp1, 1, 8);
1460 umov(dst, vtmp1, H, 0);
1461 } else {
1462 // As the lane count is 64 at most, the final expected value must be in
1463 // the lowest 64 bits after narrowing vtmp1 from D to B.
1464 sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1465 umov(dst, vtmp1, D, 0);
1466 }
1467 } else if (UseSVE > 0) {
1468 // Compress the lowest 8 bytes.
1469 fmovd(dst, vtmp1);
1470 bytemask_compress(dst);
1471 if (lane_cnt <= 8) return;
1472
1473 // Repeat on higher bytes and join the results.
1474 // Compress 8 bytes in each iteration.
1475 for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1476 sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1477 bytemask_compress(rscratch1);
1478 orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1479 }
1480 } else {
1481 assert(false, "unsupported");
1482 ShouldNotReachHere();
1483 }
1484 }
1485
1486 // Unpack the mask, a long value in src, into predicate register dst based on the
1487 // corresponding data type. Note that dst can support at most 64 lanes.
1488 // Below example gives the expected dst predicate register in different types, with
1489 // a valid src(0x658D) on a 1024-bit vector size machine.
1490 // BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1491 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1492 // INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1493 // LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1494 //
1495 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1496 // has 24 significant bits would be an invalid input if dst predicate register refers to
1497 // a LONG type 1024-bit vector, which has at most 16 lanes.
1498 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1499 FloatRegister vtmp1, FloatRegister vtmp2) {
1500 assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1501 lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1502 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1503 // Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1504   // Expected: dst = 0b01100101 10001101
1505
1506 // Put long value from general purpose register into the first lane of vector.
1507 // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1508 sve_dup(vtmp1, B, 0);
1509 mov(vtmp1, D, 0, src);
1510
1511   // As sve_cmp generates the mask with a minimum unit of one byte, we need to
1512   // transform the value in the first lane, which is currently a mask in bits, into
1513   // a mask in bytes; this can be done with SVE2's BDEP instruction.
1514
1515   // The first source input of the BDEP instruction. Deposit each mask byte into its own 8-byte lane.
1516 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1517 if (lane_cnt <= 8) {
1518     // Nothing to do, as only one byte exists.
1519 } else if (lane_cnt <= 16) {
1520 ins(vtmp1, B, vtmp1, 8, 1);
1521 mov(vtmp1, B, 1, zr);
1522 } else {
1523 sve_vector_extend(vtmp1, D, vtmp1, B);
1524 }
1525
1526 // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1527 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1528 sve_dup(vtmp2, B, 1);
1529
1530 // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1531 // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1532 // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1533 // ---------------------------------------
1534 // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1535 sve_bdep(vtmp1, D, vtmp1, vtmp2);
1536
1537 if (bt != T_BYTE) {
1538 sve_vector_extend(vtmp1, size, vtmp1, B);
1539 }
1540   // Generate the mask according to the given vector, in which the elements have
1541   // been extended to the expected type.
1542   // dst = 0b01100101 10001101
1543 sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1544 }
1545
1546 // Clobbers: rflags
1547 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1548 FloatRegister zn, FloatRegister zm, Condition cond) {
1549 assert(pg->is_governing(), "This register has to be a governing predicate register");
1550 FloatRegister z1 = zn, z2 = zm;
1551 switch (cond) {
1552 case LE: z1 = zm; z2 = zn; cond = GE; break;
1553 case LT: z1 = zm; z2 = zn; cond = GT; break;
1554 case LO: z1 = zm; z2 = zn; cond = HI; break;
1555 case LS: z1 = zm; z2 = zn; cond = HS; break;
1556 default:
1557 break;
1558 }
1559
1560 SIMD_RegVariant size = elemType_to_regVariant(bt);
1561 if (is_floating_point_type(bt)) {
1562 sve_fcm(cond, pd, size, pg, z1, z2);
1563 } else {
1564 assert(is_integral_type(bt), "unsupported element type");
1565 sve_cmp(cond, pd, size, pg, z1, z2);
1566 }
1567 }
1568
1569 // Get index of the last mask lane that is set
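// A sketch of the approach used here: reverse the predicate, use brkb to activate
// every lane before the first set lane of the reversed mask, count those lanes with
// cntp, and subtract the count from lane_count - 1. For example, with 8 lanes and
// src set at lanes {1, 4}: the reversed mask is set at {3, 6}, brkb activates
// lanes {0, 1, 2}, cntp returns 3, and dst = 7 - 3 = 4, the last set lane.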
1570 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1571 SIMD_RegVariant size = elemType_to_regVariant(bt);
1572 sve_rev(ptmp, size, src);
1573 sve_brkb(ptmp, ptrue, ptmp, false);
1574 sve_cntp(dst, size, ptrue, ptmp);
1575 movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1576 subw(dst, rscratch1, dst);
1577 }
1578
1579 // Extend integer vector src to dst with the same lane count
1580 // but larger element size, e.g. 4B -> 4I
1581 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1582 FloatRegister src, BasicType src_bt) {
1583 if (src_bt == T_BYTE) {
1584 if (dst_bt == T_SHORT) {
1585 // 4B/8B to 4S/8S
1586 assert(dst_vlen_in_bytes == 8 || dst_vlen_in_bytes == 16, "unsupported");
1587 sxtl(dst, T8H, src, T8B);
1588 } else {
1589 // 4B to 4I
1590 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1591 sxtl(dst, T8H, src, T8B);
1592 sxtl(dst, T4S, dst, T4H);
1593 }
1594 } else if (src_bt == T_SHORT) {
1595 // 4S to 4I
1596 assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1597 sxtl(dst, T4S, src, T4H);
1598 } else if (src_bt == T_INT) {
1599 // 2I to 2L
1600 assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1601 sxtl(dst, T2D, src, T2S);
1602 } else {
1603 ShouldNotReachHere();
1604 }
1605 }
1606
1607 // Narrow integer vector src down to dst with the same lane count
1608 // but smaller element size, e.g. 4I -> 4B
1609 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1610 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1611 if (src_bt == T_SHORT) {
1612 // 4S/8S to 4B/8B
1613 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1614 assert(dst_bt == T_BYTE, "unsupported");
1615 xtn(dst, T8B, src, T8H);
1616 } else if (src_bt == T_INT) {
1617 // 4I to 4B/4S
1618 assert(src_vlen_in_bytes == 16, "unsupported");
1619 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1620 xtn(dst, T4H, src, T4S);
1621 if (dst_bt == T_BYTE) {
1622 xtn(dst, T8B, dst, T8H);
1623 }
1624 } else if (src_bt == T_LONG) {
1625 // 2L to 2I
1626 assert(src_vlen_in_bytes == 16, "unsupported");
1627 assert(dst_bt == T_INT, "unsupported");
1628 xtn(dst, T2S, src, T2D);
1629 } else {
1630 ShouldNotReachHere();
1631 }
1632 }
1633
1634 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1635 FloatRegister src, SIMD_RegVariant src_size) {
1636 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1637 if (src_size == B) {
1638 switch (dst_size) {
1639 case H:
1640 sve_sunpklo(dst, H, src);
1641 break;
1642 case S:
1643 sve_sunpklo(dst, H, src);
1644 sve_sunpklo(dst, S, dst);
1645 break;
1646 case D:
1647 sve_sunpklo(dst, H, src);
1648 sve_sunpklo(dst, S, dst);
1649 sve_sunpklo(dst, D, dst);
1650 break;
1651 default:
1652 ShouldNotReachHere();
1653 }
1654 } else if (src_size == H) {
1655 if (dst_size == S) {
1656 sve_sunpklo(dst, S, src);
1657 } else { // D
1658 sve_sunpklo(dst, S, src);
1659 sve_sunpklo(dst, D, dst);
1660 }
1661 } else if (src_size == S) {
1662 sve_sunpklo(dst, D, src);
1663 }
1664 }
1665
1666 // Vector narrow from src to dst with specified element sizes.
1667 // High part of dst vector will be filled with zero.
1668 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1669 FloatRegister src, SIMD_RegVariant src_size,
1670 FloatRegister tmp) {
1671 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1672 assert_different_registers(src, tmp);
1673 sve_dup(tmp, src_size, 0);
1674 if (src_size == D) {
1675 switch (dst_size) {
1676 case S:
1677 sve_uzp1(dst, S, src, tmp);
1678 break;
1679 case H:
1680 assert_different_registers(dst, tmp);
1681 sve_uzp1(dst, S, src, tmp);
1682 sve_uzp1(dst, H, dst, tmp);
1683 break;
1684 case B:
1685 assert_different_registers(dst, tmp);
1686 sve_uzp1(dst, S, src, tmp);
1687 sve_uzp1(dst, H, dst, tmp);
1688 sve_uzp1(dst, B, dst, tmp);
1689 break;
1690 default:
1691 ShouldNotReachHere();
1692 }
1693 } else if (src_size == S) {
1694 if (dst_size == H) {
1695 sve_uzp1(dst, H, src, tmp);
1696 } else { // B
1697 assert_different_registers(dst, tmp);
1698 sve_uzp1(dst, H, src, tmp);
1699 sve_uzp1(dst, B, dst, tmp);
1700 }
1701 } else if (src_size == H) {
1702 sve_uzp1(dst, B, src, tmp);
1703 }
1704 }
1705
1706 // Extend src predicate to dst predicate with the same lane count but larger
1707 // element size, e.g. 64Byte -> 512Long
1708 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1709 uint dst_element_length_in_bytes,
1710 uint src_element_length_in_bytes) {
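  // Each punpklo doubles the predicate element size, so it is applied once,
  // twice or three times for a 2x, 4x or 8x extension respectively.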
1711 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1712 sve_punpklo(dst, src);
1713 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1714 sve_punpklo(dst, src);
1715 sve_punpklo(dst, dst);
1716 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1717 sve_punpklo(dst, src);
1718 sve_punpklo(dst, dst);
1719 sve_punpklo(dst, dst);
1720 } else {
1721 assert(false, "unsupported");
1722 ShouldNotReachHere();
1723 }
1724 }
1725
1726 // Narrow src predicate to dst predicate with the same lane count but
1727 // smaller element size, e.g. 512Long -> 64Byte
1728 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1729 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1730 // The insignificant bits in src predicate are expected to be zero.
1731 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1732 // passed as the second argument. An example narrowing operation with a given mask would be:
1733 // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1734 // Mask (for 2 Longs) : TF
1735 // Predicate register for the above mask (16 bits) : 00000001 00000000
1736 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1737 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1738 assert_different_registers(src, ptmp);
1739 assert_different_registers(dst, ptmp);
1740 sve_pfalse(ptmp);
1741 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1742 sve_uzp1(dst, B, src, ptmp);
1743 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1744 sve_uzp1(dst, H, src, ptmp);
1745 sve_uzp1(dst, B, dst, ptmp);
1746 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1747 sve_uzp1(dst, S, src, ptmp);
1748 sve_uzp1(dst, H, dst, ptmp);
1749 sve_uzp1(dst, B, dst, ptmp);
1750 } else {
1751 assert(false, "unsupported");
1752 ShouldNotReachHere();
1753 }
1754 }
1755
1756 // Vector reduction add for integral type with ASIMD instructions.
1757 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1758 Register isrc, FloatRegister vsrc,
1759 unsigned vector_length_in_bytes,
1760 FloatRegister vtmp) {
1761 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1762 assert_different_registers(dst, isrc);
1763 bool isQ = vector_length_in_bytes == 16;
1764
1765 BLOCK_COMMENT("neon_reduce_add_integral {");
1766 switch(bt) {
1767 case T_BYTE:
1768 addv(vtmp, isQ ? T16B : T8B, vsrc);
1769 smov(dst, vtmp, B, 0);
1770 addw(dst, dst, isrc, ext::sxtb);
1771 break;
1772 case T_SHORT:
1773 addv(vtmp, isQ ? T8H : T4H, vsrc);
1774 smov(dst, vtmp, H, 0);
1775 addw(dst, dst, isrc, ext::sxth);
1776 break;
1777 case T_INT:
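      // ADDV does not accept the 2S arrangement, so use a pairwise ADDP for the 64-bit case.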
1778 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1779 umov(dst, vtmp, S, 0);
1780 addw(dst, dst, isrc);
1781 break;
1782 case T_LONG:
1783 assert(isQ, "unsupported");
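      // Pairwise add of the two doubleword lanes of vsrc.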
1784 addpd(vtmp, vsrc);
1785 umov(dst, vtmp, D, 0);
1786 add(dst, dst, isrc);
1787 break;
1788 default:
1789 assert(false, "unsupported");
1790 ShouldNotReachHere();
1791 }
1792 BLOCK_COMMENT("} neon_reduce_add_integral");
1793 }
1794
1795 // Vector reduction multiply for integral type with ASIMD instructions.
1796 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1797 // Clobbers: rscratch1
1798 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1799 Register isrc, FloatRegister vsrc,
1800 unsigned vector_length_in_bytes,
1801 FloatRegister vtmp1, FloatRegister vtmp2) {
1802 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1803 bool isQ = vector_length_in_bytes == 16;
1804
1805 BLOCK_COMMENT("neon_reduce_mul_integral {");
1806 switch(bt) {
1807 case T_BYTE:
1808 if (isQ) {
1809 // Multiply the lower half and higher half of vector iteratively.
1810 // vtmp1 = vsrc[8:15]
1811 ins(vtmp1, D, vsrc, 0, 1);
1812 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1813 mulv(vtmp1, T8B, vtmp1, vsrc);
1814 // vtmp2 = vtmp1[4:7]
1815 ins(vtmp2, S, vtmp1, 0, 1);
1816 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1817 mulv(vtmp1, T8B, vtmp2, vtmp1);
1818 } else {
1819 ins(vtmp1, S, vsrc, 0, 1);
1820 mulv(vtmp1, T8B, vtmp1, vsrc);
1821 }
1822 // vtmp2 = vtmp1[2:3]
1823 ins(vtmp2, H, vtmp1, 0, 1);
1824 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1825 mulv(vtmp2, T8B, vtmp2, vtmp1);
1826 // dst = vtmp2[0] * isrc * vtmp2[1]
1827 umov(rscratch1, vtmp2, B, 0);
1828 mulw(dst, rscratch1, isrc);
1829 sxtb(dst, dst);
1830 umov(rscratch1, vtmp2, B, 1);
1831 mulw(dst, rscratch1, dst);
1832 sxtb(dst, dst);
1833 break;
1834 case T_SHORT:
1835 if (isQ) {
1836 ins(vtmp2, D, vsrc, 0, 1);
1837 mulv(vtmp2, T4H, vtmp2, vsrc);
1838 ins(vtmp1, S, vtmp2, 0, 1);
1839 mulv(vtmp1, T4H, vtmp1, vtmp2);
1840 } else {
1841 ins(vtmp1, S, vsrc, 0, 1);
1842 mulv(vtmp1, T4H, vtmp1, vsrc);
1843 }
1844 umov(rscratch1, vtmp1, H, 0);
1845 mulw(dst, rscratch1, isrc);
1846 sxth(dst, dst);
1847 umov(rscratch1, vtmp1, H, 1);
1848 mulw(dst, rscratch1, dst);
1849 sxth(dst, dst);
1850 break;
1851 case T_INT:
1852 if (isQ) {
1853 ins(vtmp1, D, vsrc, 0, 1);
1854 mulv(vtmp1, T2S, vtmp1, vsrc);
1855 } else {
1856 vtmp1 = vsrc;
1857 }
1858 umov(rscratch1, vtmp1, S, 0);
1859 mul(dst, rscratch1, isrc);
1860 umov(rscratch1, vtmp1, S, 1);
1861 mul(dst, rscratch1, dst);
1862 break;
1863 case T_LONG:
1864 umov(rscratch1, vsrc, D, 0);
1865 mul(dst, isrc, rscratch1);
1866 umov(rscratch1, vsrc, D, 1);
1867 mul(dst, dst, rscratch1);
1868 break;
1869 default:
1870 assert(false, "unsupported");
1871 ShouldNotReachHere();
1872 }
1873 BLOCK_COMMENT("} neon_reduce_mul_integral");
1874 }
1875
1876 // Vector reduction multiply for floating-point type with ASIMD instructions.
1877 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1878 FloatRegister fsrc, FloatRegister vsrc,
1879 unsigned vector_length_in_bytes,
1880 FloatRegister vtmp) {
1881 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1882 bool isQ = vector_length_in_bytes == 16;
1883
1884 BLOCK_COMMENT("neon_reduce_mul_fp {");
1885 switch(bt) {
1886 case T_FLOAT:
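      // Multiply the incoming scalar by lane 0, then by each remaining lane.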
1887 fmuls(dst, fsrc, vsrc);
1888 ins(vtmp, S, vsrc, 0, 1);
1889 fmuls(dst, dst, vtmp);
1890 if (isQ) {
1891 ins(vtmp, S, vsrc, 0, 2);
1892 fmuls(dst, dst, vtmp);
1893 ins(vtmp, S, vsrc, 0, 3);
1894 fmuls(dst, dst, vtmp);
1895 }
1896 break;
1897 case T_DOUBLE:
1898 assert(isQ, "unsupported");
1899 fmuld(dst, fsrc, vsrc);
1900 ins(vtmp, D, vsrc, 0, 1);
1901 fmuld(dst, dst, vtmp);
1902 break;
1903 default:
1904 assert(false, "unsupported");
1905 ShouldNotReachHere();
1906 }
1907 BLOCK_COMMENT("} neon_reduce_mul_fp");
1908 }
1909
1910 // Helper to select logical instruction
1911 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1912 Register Rn, Register Rm,
1913 enum shift_kind kind, unsigned shift) {
1914 switch(opc) {
1915 case Op_AndReductionV:
1916 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1917 break;
1918 case Op_OrReductionV:
1919 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1920 break;
1921 case Op_XorReductionV:
1922 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1923 break;
1924 default:
1925 assert(false, "unsupported");
1926 ShouldNotReachHere();
1927 }
1928 }
1929
1930 // Vector reduction logical operations And, Or, Xor
1931 // Clobbers: rscratch1
1932 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
1933 Register isrc, FloatRegister vsrc,
1934 unsigned vector_length_in_bytes) {
1935 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
1936 "unsupported");
1937 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1938 assert_different_registers(dst, isrc);
1939 bool isQ = vector_length_in_bytes == 16;
1940
1941 BLOCK_COMMENT("neon_reduce_logical {");
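  // Move the two halves of the vector into general registers and combine them,
  // then keep folding the result in half with shifted logical operations until a
  // single element remains, before combining with isrc.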
1942 umov(rscratch1, vsrc, isQ ? D : S, 0);
1943 umov(dst, vsrc, isQ ? D : S, 1);
1944 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
1945 switch(bt) {
1946 case T_BYTE:
1947 if (isQ) {
1948 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1949 }
1950 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1951 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
1952 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1953 sxtb(dst, dst);
1954 break;
1955 case T_SHORT:
1956 if (isQ) {
1957 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1958 }
1959 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
1960 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1961 sxth(dst, dst);
1962 break;
1963 case T_INT:
1964 if (isQ) {
1965 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
1966 }
1967 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
1968 break;
1969 case T_LONG:
1970 assert(isQ, "unsupported");
1971 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
1972 break;
1973 default:
1974 assert(false, "unsupported");
1975 ShouldNotReachHere();
1976 }
1977 BLOCK_COMMENT("} neon_reduce_logical");
1978 }
1979
1980 // Vector reduction min/max for integral type with ASIMD instructions.
1981 // Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
1982 // Clobbers: rscratch1, rflags
1983 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
1984 Register isrc, FloatRegister vsrc,
1985 unsigned vector_length_in_bytes,
1986 FloatRegister vtmp) {
1987 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
1988 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1989 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
1990 assert_different_registers(dst, isrc);
1991 bool isQ = vector_length_in_bytes == 16;
1992 bool is_min = opc == Op_MinReductionV;
1993
1994 BLOCK_COMMENT("neon_reduce_minmax_integral {");
1995 if (bt == T_LONG) {
1996 assert(vtmp == fnoreg, "should be");
1997 assert(isQ, "should be");
1998 umov(rscratch1, vsrc, D, 0);
1999 cmp(isrc, rscratch1);
2000 csel(dst, isrc, rscratch1, is_min ? LT : GT);
2001 umov(rscratch1, vsrc, D, 1);
2002 cmp(dst, rscratch1);
2003 csel(dst, dst, rscratch1, is_min ? LT : GT);
2004 } else {
2005 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
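    // SMINV/SMAXV do not accept the 2S arrangement, so fall back to a pairwise min/max.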
2006 if (size == T2S) {
2007 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2008 } else {
2009 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2010 }
2011 if (bt == T_INT) {
2012 umov(dst, vtmp, S, 0);
2013 } else {
2014 smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2015 }
2016 cmpw(dst, isrc);
2017 cselw(dst, dst, isrc, is_min ? LT : GT);
2018 }
2019 BLOCK_COMMENT("} neon_reduce_minmax_integral");
2020 }
2021
2022 // Vector reduction for integral type with SVE instruction.
2023 // Supported operations are Add, And, Or, Xor, Max, Min.
2024 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2025 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2026 FloatRegister src2, PRegister pg, FloatRegister tmp) {
2027 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2028 assert(pg->is_governing(), "This register has to be a governing predicate register");
2029 assert_different_registers(src1, dst);
2030 // Registers "dst" and "tmp" will be clobbered, while "src1" and "src2" are preserved.
2031 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
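  // The reduced lane is moved to a general register with smov (sign-extending) for
  // sub-word types and with umov for T_INT/T_LONG before it is combined with src1.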
2032 switch (opc) {
2033 case Op_AddReductionVI: {
2034 sve_uaddv(tmp, size, pg, src2);
2035 if (bt == T_BYTE) {
2036 smov(dst, tmp, size, 0);
2037 addw(dst, src1, dst, ext::sxtb);
2038 } else if (bt == T_SHORT) {
2039 smov(dst, tmp, size, 0);
2040 addw(dst, src1, dst, ext::sxth);
2041 } else {
2042 umov(dst, tmp, size, 0);
2043 addw(dst, dst, src1);
2044 }
2045 break;
2046 }
2047 case Op_AddReductionVL: {
2048 sve_uaddv(tmp, size, pg, src2);
2049 umov(dst, tmp, size, 0);
2050 add(dst, dst, src1);
2051 break;
2052 }
2053 case Op_AndReductionV: {
2054 sve_andv(tmp, size, pg, src2);
2055 if (bt == T_INT || bt == T_LONG) {
2056 umov(dst, tmp, size, 0);
2057 } else {
2058 smov(dst, tmp, size, 0);
2059 }
2060 if (bt == T_LONG) {
2061 andr(dst, dst, src1);
2062 } else {
2063 andw(dst, dst, src1);
2064 }
2065 break;
2066 }
2067 case Op_OrReductionV: {
2068 sve_orv(tmp, size, pg, src2);
2069 if (bt == T_INT || bt == T_LONG) {
2070 umov(dst, tmp, size, 0);
2071 } else {
2072 smov(dst, tmp, size, 0);
2073 }
2074 if (bt == T_LONG) {
2075 orr(dst, dst, src1);
2076 } else {
2077 orrw(dst, dst, src1);
2078 }
2079 break;
2080 }
2081 case Op_XorReductionV: {
2082 sve_eorv(tmp, size, pg, src2);
2083 if (bt == T_INT || bt == T_LONG) {
2084 umov(dst, tmp, size, 0);
2085 } else {
2086 smov(dst, tmp, size, 0);
2087 }
2088 if (bt == T_LONG) {
2089 eor(dst, dst, src1);
2090 } else {
2091 eorw(dst, dst, src1);
2092 }
2093 break;
2094 }
2095 case Op_MaxReductionV: {
2096 sve_smaxv(tmp, size, pg, src2);
2097 if (bt == T_INT || bt == T_LONG) {
2098 umov(dst, tmp, size, 0);
2099 } else {
2100 smov(dst, tmp, size, 0);
2101 }
2102 if (bt == T_LONG) {
2103 cmp(dst, src1);
2104 csel(dst, dst, src1, Assembler::GT);
2105 } else {
2106 cmpw(dst, src1);
2107 cselw(dst, dst, src1, Assembler::GT);
2108 }
2109 break;
2110 }
2111 case Op_MinReductionV: {
2112 sve_sminv(tmp, size, pg, src2);
2113 if (bt == T_INT || bt == T_LONG) {
2114 umov(dst, tmp, size, 0);
2115 } else {
2116 smov(dst, tmp, size, 0);
2117 }
2118 if (bt == T_LONG) {
2119 cmp(dst, src1);
2120 csel(dst, dst, src1, Assembler::LT);
2121 } else {
2122 cmpw(dst, src1);
2123 cselw(dst, dst, src1, Assembler::LT);
2124 }
2125 break;
2126 }
2127 default:
2128 assert(false, "unsupported");
2129 ShouldNotReachHere();
2130 }
2131
2132 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2133 if (bt == T_BYTE) {
2134 sxtb(dst, dst);
2135 } else if (bt == T_SHORT) {
2136 sxth(dst, dst);
2137 }
2138 }
2139 }
2140
2141 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2142 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2143 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2144 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2145 uint32_t max_vector_length = Matcher::max_vector_size(bt);
2146 assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2147
2148 // Set all elements to false if the input "lane_cnt" is zero.
2149 if (lane_cnt == 0) {
2150 sve_pfalse(dst);
2151 return;
2152 }
2153
2154 SIMD_RegVariant size = elemType_to_regVariant(bt);
2155 assert(size != Q, "invalid size");
2156
2157 // Set all elements to true if "lane_cnt" equals the max lane count.
2158 if (lane_cnt == max_vector_length) {
2159 sve_ptrue(dst, size, /* ALL */ 0b11111);
2160 return;
2161 }
2162
2163 // Fixed numbers for "ptrue".
2164 switch(lane_cnt) {
2165 case 1: /* VL1 */
2166 case 2: /* VL2 */
2167 case 3: /* VL3 */
2168 case 4: /* VL4 */
2169 case 5: /* VL5 */
2170 case 6: /* VL6 */
2171 case 7: /* VL7 */
2172 case 8: /* VL8 */
2173 sve_ptrue(dst, size, lane_cnt);
2174 return;
2175 case 16:
2176 sve_ptrue(dst, size, /* VL16 */ 0b01001);
2177 return;
2178 case 32:
2179 sve_ptrue(dst, size, /* VL32 */ 0b01010);
2180 return;
2181 case 64:
2182 sve_ptrue(dst, size, /* VL64 */ 0b01011);
2183 return;
2184 case 128:
2185 sve_ptrue(dst, size, /* VL128 */ 0b01100);
2186 return;
2187 case 256:
2188 sve_ptrue(dst, size, /* VL256 */ 0b01101);
2189 return;
2190 default:
2191 break;
2192 }
2193
2194 // Special patterns for "ptrue".
2195 if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2196 sve_ptrue(dst, size, /* POW2 */ 0b00000);
2197 } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2198 sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2199 } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2200 sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2201 } else {
2202 // Use "whileltw" for the remaining cases.
2203 mov(rscratch1, lane_cnt);
2204 sve_whileltw(dst, size, zr, rscratch1);
2205 }
2206 }
2207
2208 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2209 // Any remaining elements of dst will be filled with zero.
2210 // Clobbers: rscratch1
2211 // Preserves: src, mask
2212 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2213 FloatRegister vtmp1, FloatRegister vtmp2,
2214 PRegister pgtmp) {
2215 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2216 assert_different_registers(dst, src, vtmp1, vtmp2);
2217 assert_different_registers(mask, pgtmp);
2218
2219 // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
2220 // mask = 0001 0000 0000 0001 0001 0000 0001 0001
2221 // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
2222 sve_dup(vtmp2, H, 0);
2223
2224 // Extend lowest half to type INT.
2225 // dst = 00004444 00003333 00002222 00001111
2226 sve_uunpklo(dst, S, src);
2227 // pgtmp = 00000001 00000000 00000001 00000001
2228 sve_punpklo(pgtmp, mask);
2229 // Pack the active elements in size of type INT to the right,
2230 // and fill the remaining elements with zero.
2231 // dst = 00000000 00004444 00002222 00001111
2232 sve_compact(dst, S, dst, pgtmp);
2233 // Narrow the result back to type SHORT.
2234 // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2235 sve_uzp1(dst, H, dst, vtmp2);
2236 // Count the active elements of lowest half.
2237 // rscratch1 = 3
2238 sve_cntp(rscratch1, S, ptrue, pgtmp);
2239
2240 // Repeat to the highest half.
2241 // pgtmp = 00000001 00000000 00000000 00000001
2242 sve_punpkhi(pgtmp, mask);
2243 // vtmp1 = 00008888 00007777 00006666 00005555
2244 sve_uunpkhi(vtmp1, S, src);
2245 // vtmp1 = 00000000 00000000 00008888 00005555
2246 sve_compact(vtmp1, S, vtmp1, pgtmp);
2247 // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2248 sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2249
2250 // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
2251 // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2252 // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2253 // TRUE_CNT is the number of active elements in the compressed low part.
2254 neg(rscratch1, rscratch1);
2255 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2256 sve_index(vtmp2, H, rscratch1, 1);
2257 // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2258 sve_tbl(vtmp1, H, vtmp1, vtmp2);
2259
2260 // Combine the compressed high part (after shifting) with the compressed low part.
2261 // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2262 sve_orr(dst, dst, vtmp1);
2263 }
2264
2265 // Clobbers: rscratch1, rscratch2
2266 // Preserves: src, mask
2267 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2268 FloatRegister vtmp1, FloatRegister vtmp2,
2269 FloatRegister vtmp3, FloatRegister vtmp4,
2270 PRegister ptmp, PRegister pgtmp) {
2271 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2272 assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2273 assert_different_registers(mask, ptmp, pgtmp);
2274 // Example input: src = 88 77 66 55 44 33 22 11
2275 // mask = 01 00 00 01 01 00 01 01
2276 // Expected result: dst = 00 00 00 88 55 44 22 11
2277
2278 sve_dup(vtmp4, B, 0);
2279 // Extend lowest half to type SHORT.
2280 // vtmp1 = 0044 0033 0022 0011
2281 sve_uunpklo(vtmp1, H, src);
2282 // ptmp = 0001 0000 0001 0001
2283 sve_punpklo(ptmp, mask);
2284 // Count the active elements of lowest half.
2285 // rscratch2 = 3
2286 sve_cntp(rscratch2, H, ptrue, ptmp);
2287 // Pack the active elements in size of type SHORT to the right,
2288 // and fill the remaining elements with zero.
2289 // dst = 0000 0044 0022 0011
2290 sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2291 // Narrow the result back to type BYTE.
2292 // dst = 00 00 00 00 00 44 22 11
2293 sve_uzp1(dst, B, dst, vtmp4);
2294
2295 // Repeat to the highest half.
2296 // ptmp = 0001 0000 0000 0001
2297 sve_punpkhi(ptmp, mask);
2298 // vtmp2 = 0088 0077 0066 0055
2299 sve_uunpkhi(vtmp2, H, src);
2300 // vtmp1 = 0000 0000 0088 0055
2301 sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2302
2303 sve_dup(vtmp4, B, 0);
2304 // vtmp1 = 00 00 00 00 00 00 88 55
2305 sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2306
2307 // Compressed low: dst = 00 00 00 00 00 44 22 11
2308 // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
2309 // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2310 // TRUE_CNT is the number of active elements in the compressed low part.
2311 neg(rscratch2, rscratch2);
2312 // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2313 sve_index(vtmp2, B, rscratch2, 1);
2314 // vtmp1 = 00 00 00 88 55 00 00 00
2315 sve_tbl(vtmp1, B, vtmp1, vtmp2);
2316 // Combine the compressed high part (after shifting) with the compressed low part.
2317 // dst = 00 00 00 88 55 44 22 11
2318 sve_orr(dst, dst, vtmp1);
2319 }
2320
2321 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2322 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2323 SIMD_Arrangement size = isQ ? T16B : T8B;
2324 if (bt == T_BYTE) {
2325 rbit(dst, size, src);
2326 } else {
2327 neon_reverse_bytes(dst, src, bt, isQ);
2328 rbit(dst, size, dst);
2329 }
2330 }
2331
2332 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2333 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2334 SIMD_Arrangement size = isQ ? T16B : T8B;
2335 switch (bt) {
2336 case T_BYTE:
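      // Reversing bytes within a single byte element is a no-op; just copy src to dst if they differ.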
2337 if (dst != src) {
2338 orr(dst, size, src, src);
2339 }
2340 break;
2341 case T_SHORT:
2342 rev16(dst, size, src);
2343 break;
2344 case T_INT:
2345 rev32(dst, size, src);
2346 break;
2347 case T_LONG:
2348 rev64(dst, size, src);
2349 break;
2350 default:
2351 assert(false, "unsupported");
2352 ShouldNotReachHere();
2353 }
2354 }
2355
2356 // Extract a scalar element from an sve vector at position 'idx'.
2357 // The input elements in src are expected to be of integral type.
2358 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2359 int idx, FloatRegister vtmp) {
2360 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2361 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2362   if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
2363 if (bt == T_INT || bt == T_LONG) {
2364 umov(dst, src, size, idx);
2365 } else {
2366 smov(dst, src, size, idx);
2367 }
2368 } else {
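    // The element lies beyond the first 128 bits: shift it down to lane 0 with EXT,
    // then move it out to the general register.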
2369 sve_orr(vtmp, src, src);
2370 sve_ext(vtmp, vtmp, idx << size);
2371 if (bt == T_INT || bt == T_LONG) {
2372 umov(dst, vtmp, size, 0);
2373 } else {
2374 smov(dst, vtmp, size, 0);
2375 }
2376 }
2377 }
2378
2379 // java.lang.Math::round intrinsics
2380
2381 // Clobbers: rscratch1, rflags
2382 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2383 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2384 assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2385 switch (T) {
2386 case T2S:
2387 case T4S:
2388 fmovs(tmp1, T, 0.5f);
2389 mov(rscratch1, jint_cast(0x1.0p23f));
2390 break;
2391 case T2D:
2392 fmovd(tmp1, T, 0.5);
2393 mov(rscratch1, julong_cast(0x1.0p52));
2394 break;
2395 default:
2396 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2397 }
2398 fadd(tmp1, T, tmp1, src);
2399 fcvtms(tmp1, T, tmp1);
2400 // tmp1 = floor(src + 0.5, ties to even)
2401
2402 fcvtas(dst, T, src);
2403 // dst = round(src), ties to away
2404
2405 fneg(tmp3, T, src);
2406 dup(tmp2, T, rscratch1);
2407 cm(HS, tmp3, T, tmp3, tmp2);
2408 // tmp3 is now a set of flags
2409
2410 bif(dst, T16B, tmp1, tmp3);
2411 // result in dst
2412 }
2413
2414 // Clobbers: rscratch1, rflags
2415 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2416 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2417 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2418 assert_different_registers(tmp1, tmp2, src, dst);
2419
2420 switch (T) {
2421 case S:
2422 mov(rscratch1, jint_cast(0x1.0p23f));
2423 break;
2424 case D:
2425 mov(rscratch1, julong_cast(0x1.0p52));
2426 break;
2427 default:
2428 assert(T == S || T == D, "invalid register variant");
2429 }
2430
2431 sve_frinta(dst, T, ptrue, src);
2432 // dst = round(src), ties to away
2433
2434 Label none;
2435
2436 sve_fneg(tmp1, T, ptrue, src);
2437 sve_dup(tmp2, T, rscratch1);
2438 sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2439 br(EQ, none);
2440 {
2441 sve_cpy(tmp1, T, pgtmp, 0.5);
2442 sve_fadd(tmp1, T, pgtmp, src);
2443 sve_frintm(dst, T, pgtmp, tmp1);
2444 // dst = floor(src + 0.5, ties to even)
2445 }
2446 bind(none);
2447
2448 sve_fcvtzs(dst, T, ptrue, dst, T);
2449 // result in dst
2450 }
2451
2452 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2453 FloatRegister one, SIMD_Arrangement T) {
2454 assert_different_registers(dst, src, zero, one);
2455 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2456
2457 facgt(dst, T, src, zero);
2458 ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
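  // For lanes selected by dst (non-zero, non-NaN) take the magnitude bits of "one" and
  // only the sign bit from src, giving +/-1.0; zero and NaN lanes pass src through unchanged.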
2459 bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2460 }
2461
2462 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2463 FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2464 assert_different_registers(dst, src, zero, one, vtmp);
2465 assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2466
2467 sve_orr(vtmp, src, src);
2468   sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2469 switch (T) {
2470 case S:
2471 sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2472 sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2473 // on the sign of the float value
2474 break;
2475 case D:
2476 sve_and(vtmp, T, min_jlong);
2477 sve_orr(vtmp, T, jlong_cast(1.0));
2478 break;
2479 default:
2480 assert(false, "unsupported");
2481 ShouldNotReachHere();
2482 }
2483 sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2484 // Result in dst
2485 }
2486
2487 bool C2_MacroAssembler::in_scratch_emit_size() {
2488 if (ciEnv::current()->task() != nullptr) {
2489 PhaseOutput* phase_output = Compile::current()->output();
2490 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2491 return true;
2492 }
2493 }
2494 return MacroAssembler::in_scratch_emit_size();
2495 }
2496
2497 void C2_MacroAssembler::load_nklass_compact(Register dst, Register obj, Register index, int scale, int disp) {
2498 C2LoadNKlassStub* stub = new (Compile::current()->comp_arena()) C2LoadNKlassStub(dst);
2499 Compile::current()->output()->add_stub(stub);
2500
2501 // Note: Don't clobber obj anywhere in this method!
2502
2503 // The incoming address points to obj-start + klass_offset_in_bytes. We need to extract
2504 // obj-start, so that we can load from the object's mark-word instead. Usually the address
2505 // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
2506 // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
2507 // then passes that register as obj and 0 in disp. The following code extracts the base
2508 // and offset to load the mark-word.
2509 int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
2510 if (index == noreg) {
2511 ldr(dst, Address(obj, offset));
2512 } else {
2513 lea(dst, Address(obj, index, Address::lsl(scale)));
2514 ldr(dst, Address(dst, offset));
2515 }
2516 // NOTE: We can't use tbnz here, because the target is sometimes too far away
2517 // and cannot be encoded.
2518 tst(dst, markWord::monitor_value);
2519 br(Assembler::NE, stub->entry());
2520 bind(stub->continuation());
2521 lsr(dst, dst, markWord::klass_shift);
2522 }