1 /*
2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "precompiled.hpp"
27 #include "asm/assembler.hpp"
28 #include "asm/assembler.inline.hpp"
29 #include "opto/c2_MacroAssembler.hpp"
30 #include "opto/compile.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/subnode.hpp"
34 #include "runtime/stubRoutines.hpp"
35 #include "utilities/globalDefinitions.hpp"
36
37 #ifdef PRODUCT
38 #define BLOCK_COMMENT(str) /* nothing */
39 #define STOP(error) stop(error)
40 #else
41 #define BLOCK_COMMENT(str) block_comment(str)
42 #define STOP(error) block_comment(error); stop(error)
43 #endif
44
45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
46
47 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
48 Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
49 // Use cr register to indicate the fast_lock result: zero for success; non-zero for failure.
50 Register flag = t1;
51 Register oop = objectReg;
52 Register box = boxReg;
53 Register disp_hdr = tmp1Reg;
54 Register tmp = tmp2Reg;
55 Label object_has_monitor;
56 // Finish fast lock successfully. MUST branch to with flag == 0
57 Label locked;
58 // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
59 Label slow_path;
60
61 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
62 assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);
63
64 mv(flag, 1);
65
66 // Load markWord from object into displaced_header.
67 ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
68
69 if (DiagnoseSyncOnValueBasedClasses != 0) {
70 load_klass(tmp, oop);
71 lwu(tmp, Address(tmp, Klass::access_flags_offset()));
72 test_bit(tmp, tmp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
73 bnez(tmp, slow_path);
74 }
75
76 // Check for existing monitor
77 test_bit(tmp, disp_hdr, exact_log2(markWord::monitor_value));
78 bnez(tmp, object_has_monitor);
79
80 if (LockingMode == LM_MONITOR) {
81 j(slow_path);
82 } else {
83 assert(LockingMode == LM_LEGACY, "must be");
84 // Set tmp to be (markWord of object | UNLOCK_VALUE).
85 ori(tmp, disp_hdr, markWord::unlocked_value);
86
87 // Initialize the box. (Must happen before we update the object mark!)
88 sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
89
90 // Compare object markWord with an unlocked value (tmp) and if
91 // equal exchange the stack address of our box with object markWord.
92 // On failure disp_hdr contains the possibly locked markWord.
93 cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64,
94 Assembler::aq, Assembler::rl, /*result*/disp_hdr);
95 beq(disp_hdr, tmp, locked);
96
97 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
98
99 // If the compare-and-exchange succeeded, then we found an unlocked
100 // object, will have now locked it will continue at label locked
101 // We did not see an unlocked object so try the fast recursive case.
102
103 // Check if the owner is self by comparing the value in the
104 // markWord of object (disp_hdr) with the stack pointer.
105 sub(disp_hdr, disp_hdr, sp);
106 mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
107 // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto label locked,
108 // hence we can store 0 as the displaced header in the box, which indicates that it is a
109 // recursive lock.
110 andr(tmp/*==0?*/, disp_hdr, tmp);
111 sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
112 beqz(tmp, locked);
113 j(slow_path);
114 }
115
116 // Handle existing monitor.
117 bind(object_has_monitor);
118 // The object's monitor m is unlocked iff m->owner == NULL,
119 // otherwise m->owner may contain a thread or a stack address.
120 //
121 // Try to CAS m->owner from NULL to current thread.
122 add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
123 cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64,
124 Assembler::aq, Assembler::rl, /*result*/tmp3Reg); // cas succeeds if tmp3Reg == zr(expected)
125
126 // Store a non-null value into the box to avoid looking like a re-entrant
127 // lock. The fast-path monitor unlock code checks for
128 // markWord::monitor_value so use markWord::unused_mark which has the
129 // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
130 mv(tmp, (address)markWord::unused_mark().value());
131 sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
132
133 beqz(tmp3Reg, locked); // CAS success means locking succeeded
134
135 bne(tmp3Reg, xthread, slow_path); // Check for recursive locking
136
137 // Recursive lock case
138 increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, tmp2Reg, tmp3Reg);
139
140 bind(locked);
141 mv(flag, zr);
142 increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2Reg, tmp3Reg);
143
144 #ifdef ASSERT
145 // Check that locked label is reached with flag == 0.
146 Label flag_correct;
147 beqz(flag, flag_correct);
148 stop("Fast Lock Flag != 0");
149 #endif
150
151 bind(slow_path);
152 #ifdef ASSERT
153 // Check that slow_path label is reached with flag != 0.
154 bnez(flag, flag_correct);
155 stop("Fast Lock Flag == 0");
156 bind(flag_correct);
157 #endif
158 // C2 uses the value of flag (0 vs !0) to determine the continuation.
159 }
160
161 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
162 Register tmp1Reg, Register tmp2Reg) {
163 // Use cr register to indicate the fast_unlock result: zero for success; non-zero for failure.
164 Register flag = t1;
165 Register oop = objectReg;
166 Register box = boxReg;
167 Register disp_hdr = tmp1Reg;
168 Register tmp = tmp2Reg;
169 Label object_has_monitor;
170 // Finish fast lock successfully. MUST branch to with flag == 0
171 Label unlocked;
172 // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
173 Label slow_path;
174
175 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
176 assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);
177
178 mv(flag, 1);
179
180 if (LockingMode == LM_LEGACY) {
181 // Find the lock address and load the displaced header from the stack.
182 ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
183
184 // If the displaced header is 0, we have a recursive unlock.
185 beqz(disp_hdr, unlocked);
186 }
187
188 // Handle existing monitor.
189 ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
190 test_bit(t0, tmp, exact_log2(markWord::monitor_value));
191 bnez(t0, object_has_monitor);
192
193 if (LockingMode == LM_MONITOR) {
194 j(slow_path);
195 } else {
196 assert(LockingMode == LM_LEGACY, "must be");
197 // Check if it is still a light weight lock, this is true if we
198 // see the stack address of the basicLock in the markWord of the
199 // object.
200
201 cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64,
202 Assembler::relaxed, Assembler::rl, /*result*/tmp);
203 beq(box, tmp, unlocked); // box == tmp if cas succeeds
204 j(slow_path);
205 }
206
207 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
208
209 // Handle existing monitor.
210 bind(object_has_monitor);
211 STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
212 add(tmp, tmp, -(int)markWord::monitor_value); // monitor
213
214 ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
215
216 Label notRecursive;
217 beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.
218
219 // Recursive lock
220 addi(disp_hdr, disp_hdr, -1);
221 sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
222 j(unlocked);
223
224 bind(notRecursive);
225 ld(t0, Address(tmp, ObjectMonitor::EntryList_offset()));
226 ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
227 orr(t0, t0, disp_hdr); // Will be 0 if both are 0.
228 bnez(t0, slow_path);
229
230 // need a release store here
231 la(tmp, Address(tmp, ObjectMonitor::owner_offset()));
232 membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
233 sd(zr, Address(tmp)); // set unowned
234
235 bind(unlocked);
236 mv(flag, zr);
237 decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp1Reg, tmp2Reg);
238
239 #ifdef ASSERT
240 // Check that unlocked label is reached with flag == 0.
241 Label flag_correct;
242 beqz(flag, flag_correct);
243 stop("Fast Lock Flag != 0");
244 #endif
245
246 bind(slow_path);
247 #ifdef ASSERT
248 // Check that slow_path label is reached with flag != 0.
249 bnez(flag, flag_correct);
250 stop("Fast Lock Flag == 0");
251 bind(flag_correct);
252 #endif
253 // C2 uses the value of flag (0 vs !0) to determine the continuation.
254 }
255
256 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register tmp1, Register tmp2, Register tmp3) {
257 // Flag register, zero for success; non-zero for failure.
258 Register flag = t1;
259
260 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
261 assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);
262
263 mv(flag, 1);
264
265 // Handle inflated monitor.
266 Label inflated;
267 // Finish fast lock successfully. MUST branch to with flag == 0
268 Label locked;
269 // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
270 Label slow_path;
271
272 if (DiagnoseSyncOnValueBasedClasses != 0) {
273 load_klass(tmp1, obj);
274 lwu(tmp1, Address(tmp1, Klass::access_flags_offset()));
275 test_bit(tmp1, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
276 bnez(tmp1, slow_path);
277 }
278
279 const Register tmp1_mark = tmp1;
280
281 { // Lightweight locking
282
283 // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
284 Label push;
285
286 const Register tmp2_top = tmp2;
287 const Register tmp3_t = tmp3;
288
289 // Check if lock-stack is full.
290 lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
291 mv(tmp3_t, (unsigned)LockStack::end_offset());
292 bge(tmp2_top, tmp3_t, slow_path);
293
294 // Check if recursive.
295 add(tmp3_t, xthread, tmp2_top);
296 ld(tmp3_t, Address(tmp3_t, -oopSize));
297 beq(obj, tmp3_t, push);
298
299 // Relaxed normal load to check for monitor. Optimization for monitor case.
300 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
301 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
302 bnez(tmp3_t, inflated);
303
304 // Not inflated
305 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
306
307 // Try to lock. Transition lock-bits 0b01 => 0b00
308 ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
309 xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
310 cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
311 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
312 bne(tmp1_mark, tmp3_t, slow_path);
313
314 bind(push);
315 // After successful lock, push object on lock-stack.
316 add(tmp3_t, xthread, tmp2_top);
317 sd(obj, Address(tmp3_t));
318 addw(tmp2_top, tmp2_top, oopSize);
319 sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
320 j(locked);
321 }
322
323 { // Handle inflated monitor.
324 bind(inflated);
325
326 // mark contains the tagged ObjectMonitor*.
327 const Register tmp1_tagged_monitor = tmp1_mark;
328 const uintptr_t monitor_tag = markWord::monitor_value;
329 const Register tmp2_owner_addr = tmp2;
330 const Register tmp3_owner = tmp3;
331
332 // Compute owner address.
333 la(tmp2_owner_addr, Address(tmp1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));
334
335 // CAS owner (null => current thread).
336 cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ xthread, Assembler::int64,
337 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
338 beqz(tmp3_owner, locked);
339
340 // Check if recursive.
341 bne(tmp3_owner, xthread, slow_path);
342
343 // Recursive.
344 increment(Address(tmp1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1, tmp2, tmp3);
345 }
346
347 bind(locked);
348 mv(flag, zr);
349 increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);
350
351 #ifdef ASSERT
352 // Check that locked label is reached with flag == 0.
353 Label flag_correct;
354 beqz(flag, flag_correct);
355 stop("Fast Lock Flag != 0");
356 #endif
357
358 bind(slow_path);
359 #ifdef ASSERT
360 // Check that slow_path label is reached with flag != 0.
361 bnez(flag, flag_correct);
362 stop("Fast Lock Flag == 0");
363 bind(flag_correct);
364 #endif
365 // C2 uses the value of flag (0 vs !0) to determine the continuation.
366 }
367
368 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register tmp1, Register tmp2,
369 Register tmp3) {
370 // Flag register, zero for success; non-zero for failure.
371 Register flag = t1;
372
373 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
374 assert_different_registers(obj, tmp1, tmp2, tmp3, flag, t0);
375
376 mv(flag, 1);
377
378 // Handle inflated monitor.
379 Label inflated, inflated_load_monitor;
380 // Finish fast unlock successfully. unlocked MUST branch to with flag == 0
381 Label unlocked;
382 // Finish fast unlock unsuccessfully. MUST branch to with flag != 0
383 Label slow_path;
384
385 const Register tmp1_mark = tmp1;
386 const Register tmp2_top = tmp2;
387 const Register tmp3_t = tmp3;
388
389 { // Lightweight unlock
390
391 // Check if obj is top of lock-stack.
392 lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
393 subw(tmp2_top, tmp2_top, oopSize);
394 add(tmp3_t, xthread, tmp2_top);
395 ld(tmp3_t, Address(tmp3_t));
396 // Top of lock stack was not obj. Must be monitor.
397 bne(obj, tmp3_t, inflated_load_monitor);
398
399 // Pop lock-stack.
400 DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
401 DEBUG_ONLY(sd(zr, Address(tmp3_t));)
402 sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
403
404 // Check if recursive.
405 add(tmp3_t, xthread, tmp2_top);
406 ld(tmp3_t, Address(tmp3_t, -oopSize));
407 beq(obj, tmp3_t, unlocked);
408
409 // Not recursive.
410 // Load Mark.
411 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
412
413 // Check header for monitor (0b10).
414 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
415 bnez(tmp3_t, inflated);
416
417 // Try to unlock. Transition lock bits 0b00 => 0b01
418 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
419 ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
420 cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
421 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
422 beq(tmp1_mark, tmp3_t, unlocked);
423
424 // Compare and exchange failed.
425 // Restore lock-stack and handle the unlock in runtime.
426 DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
427 DEBUG_ONLY(sd(obj, Address(tmp3_t));)
428 addw(tmp2_top, tmp2_top, oopSize);
429 sd(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
430 j(slow_path);
431 }
432
433 { // Handle inflated monitor.
434 bind(inflated_load_monitor);
435 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
436 #ifdef ASSERT
437 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
438 bnez(tmp3_t, inflated);
439 stop("Fast Unlock not monitor");
440 #endif
441
442 bind(inflated);
443
444 #ifdef ASSERT
445 Label check_done;
446 subw(tmp2_top, tmp2_top, oopSize);
447 mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
448 blt(tmp2_top, tmp3_t, check_done);
449 add(tmp3_t, xthread, tmp2_top);
450 ld(tmp3_t, Address(tmp3_t));
451 bne(obj, tmp3_t, inflated);
452 stop("Fast Unlock lock on stack");
453 bind(check_done);
454 #endif
455
456 // mark contains the tagged ObjectMonitor*.
457 const Register tmp1_monitor = tmp1_mark;
458 const uintptr_t monitor_tag = markWord::monitor_value;
459
460 // Untag the monitor.
461 sub(tmp1_monitor, tmp1_mark, monitor_tag);
462
463 const Register tmp2_recursions = tmp2;
464 Label not_recursive;
465
466 // Check if recursive.
467 ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
468 beqz(tmp2_recursions, not_recursive);
469
470 // Recursive unlock.
471 addi(tmp2_recursions, tmp2_recursions, -1);
472 sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
473 j(unlocked);
474
475 bind(not_recursive);
476
477 Label release;
478 const Register tmp2_owner_addr = tmp2;
479
480 // Compute owner address.
481 la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));
482
483 // Check if the entry lists are empty.
484 ld(t0, Address(tmp1_monitor, ObjectMonitor::EntryList_offset()));
485 ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::cxq_offset()));
486 orr(t0, t0, tmp3_t);
487 beqz(t0, release);
488
489 // The owner may be anonymous and we removed the last obj entry in
490 // the lock-stack. This loses the information about the owner.
491 // Write the thread to the owner field so the runtime knows the owner.
492 sd(xthread, Address(tmp2_owner_addr));
493 j(slow_path);
494
495 bind(release);
496 // Set owner to null.
497 membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
498 sd(zr, Address(tmp2_owner_addr));
499 }
500
501 bind(unlocked);
502 mv(flag, zr);
503 decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, tmp2, tmp3);
504
505 #ifdef ASSERT
506 // Check that unlocked label is reached with flag == 0.
507 Label flag_correct;
508 beqz(flag, flag_correct);
509 stop("Fast Lock Flag != 0");
510 #endif
511
512 bind(slow_path);
513 #ifdef ASSERT
514 // Check that slow_path label is reached with flag != 0.
515 bnez(flag, flag_correct);
516 stop("Fast Lock Flag == 0");
517 bind(flag_correct);
518 #endif
519 // C2 uses the value of flag (0 vs !0) to determine the continuation.
520 }
521
522 // short string
523 // StringUTF16.indexOfChar
524 // StringLatin1.indexOfChar
525 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
526 Register ch, Register result,
527 bool isL)
528 {
529 Register ch1 = t0;
530 Register index = t1;
531
532 BLOCK_COMMENT("string_indexof_char_short {");
533
534 Label LOOP, LOOP1, LOOP4, LOOP8;
535 Label MATCH, MATCH1, MATCH2, MATCH3,
536 MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
537
538 mv(result, -1);
539 mv(index, zr);
540
541 bind(LOOP);
542 addi(t0, index, 8);
543 ble(t0, cnt1, LOOP8);
544 addi(t0, index, 4);
545 ble(t0, cnt1, LOOP4);
546 j(LOOP1);
547
548 bind(LOOP8);
549 isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
550 beq(ch, ch1, MATCH);
551 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
552 beq(ch, ch1, MATCH1);
553 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
554 beq(ch, ch1, MATCH2);
555 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
556 beq(ch, ch1, MATCH3);
557 isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
558 beq(ch, ch1, MATCH4);
559 isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
560 beq(ch, ch1, MATCH5);
561 isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
562 beq(ch, ch1, MATCH6);
563 isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
564 beq(ch, ch1, MATCH7);
565 addi(index, index, 8);
566 addi(str1, str1, isL ? 8 : 16);
567 blt(index, cnt1, LOOP);
568 j(NOMATCH);
569
570 bind(LOOP4);
571 isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
572 beq(ch, ch1, MATCH);
573 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
574 beq(ch, ch1, MATCH1);
575 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
576 beq(ch, ch1, MATCH2);
577 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
578 beq(ch, ch1, MATCH3);
579 addi(index, index, 4);
580 addi(str1, str1, isL ? 4 : 8);
581 bge(index, cnt1, NOMATCH);
582
583 bind(LOOP1);
584 isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
585 beq(ch, ch1, MATCH);
586 addi(index, index, 1);
587 addi(str1, str1, isL ? 1 : 2);
588 blt(index, cnt1, LOOP1);
589 j(NOMATCH);
590
591 bind(MATCH1);
592 addi(index, index, 1);
593 j(MATCH);
594
595 bind(MATCH2);
596 addi(index, index, 2);
597 j(MATCH);
598
599 bind(MATCH3);
600 addi(index, index, 3);
601 j(MATCH);
602
603 bind(MATCH4);
604 addi(index, index, 4);
605 j(MATCH);
606
607 bind(MATCH5);
608 addi(index, index, 5);
609 j(MATCH);
610
611 bind(MATCH6);
612 addi(index, index, 6);
613 j(MATCH);
614
615 bind(MATCH7);
616 addi(index, index, 7);
617
618 bind(MATCH);
619 mv(result, index);
620 bind(NOMATCH);
621 BLOCK_COMMENT("} string_indexof_char_short");
622 }
623
624 // StringUTF16.indexOfChar
625 // StringLatin1.indexOfChar
626 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
627 Register ch, Register result,
628 Register tmp1, Register tmp2,
629 Register tmp3, Register tmp4,
630 bool isL)
631 {
632 Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
633 Register ch1 = t0;
634 Register orig_cnt = t1;
635 Register mask1 = tmp3;
636 Register mask2 = tmp2;
637 Register match_mask = tmp1;
638 Register trailing_char = tmp4;
639 Register unaligned_elems = tmp4;
640
641 BLOCK_COMMENT("string_indexof_char {");
642 beqz(cnt1, NOMATCH);
643
644 addi(t0, cnt1, isL ? -32 : -16);
645 bgtz(t0, DO_LONG);
646 string_indexof_char_short(str1, cnt1, ch, result, isL);
647 j(DONE);
648
649 bind(DO_LONG);
650 mv(orig_cnt, cnt1);
651 if (AvoidUnalignedAccesses) {
652 Label ALIGNED;
653 andi(unaligned_elems, str1, 0x7);
654 beqz(unaligned_elems, ALIGNED);
655 sub(unaligned_elems, unaligned_elems, 8);
656 neg(unaligned_elems, unaligned_elems);
657 if (!isL) {
658 srli(unaligned_elems, unaligned_elems, 1);
659 }
660 // do unaligned part per element
661 string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
662 bgez(result, DONE);
663 mv(orig_cnt, cnt1);
664 sub(cnt1, cnt1, unaligned_elems);
665 bind(ALIGNED);
666 }
667
668 // duplicate ch
669 if (isL) {
670 slli(ch1, ch, 8);
671 orr(ch, ch1, ch);
672 }
673 slli(ch1, ch, 16);
674 orr(ch, ch1, ch);
675 slli(ch1, ch, 32);
676 orr(ch, ch1, ch);
677
678 if (!isL) {
679 slli(cnt1, cnt1, 1);
680 }
681
682 uint64_t mask0101 = UCONST64(0x0101010101010101);
683 uint64_t mask0001 = UCONST64(0x0001000100010001);
684 mv(mask1, isL ? mask0101 : mask0001);
685 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
686 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
687 mv(mask2, isL ? mask7f7f : mask7fff);
688
689 bind(CH1_LOOP);
690 ld(ch1, Address(str1));
691 addi(str1, str1, 8);
692 addi(cnt1, cnt1, -8);
693 compute_match_mask(ch1, ch, match_mask, mask1, mask2);
694 bnez(match_mask, HIT);
695 bgtz(cnt1, CH1_LOOP);
696 j(NOMATCH);
697
698 bind(HIT);
699 ctzc_bit(trailing_char, match_mask, isL, ch1, result);
700 srli(trailing_char, trailing_char, 3);
701 addi(cnt1, cnt1, 8);
702 ble(cnt1, trailing_char, NOMATCH);
703 // match case
704 if (!isL) {
705 srli(cnt1, cnt1, 1);
706 srli(trailing_char, trailing_char, 1);
707 }
708
709 sub(result, orig_cnt, cnt1);
710 add(result, result, trailing_char);
711 j(DONE);
712
713 bind(NOMATCH);
714 mv(result, -1);
715
716 bind(DONE);
717 BLOCK_COMMENT("} string_indexof_char");
718 }
719
720 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
721
722 // Search for needle in haystack and return index or -1
723 // x10: result
724 // x11: haystack
725 // x12: haystack_len
726 // x13: needle
727 // x14: needle_len
728 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
729 Register haystack_len, Register needle_len,
730 Register tmp1, Register tmp2,
731 Register tmp3, Register tmp4,
732 Register tmp5, Register tmp6,
733 Register result, int ae)
734 {
735 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
736
737 Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
738
739 Register ch1 = t0;
740 Register ch2 = t1;
741 Register nlen_tmp = tmp1; // needle len tmp
742 Register hlen_tmp = tmp2; // haystack len tmp
743 Register result_tmp = tmp4;
744
745 bool isLL = ae == StrIntrinsicNode::LL;
746
747 bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
748 bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
749 int needle_chr_shift = needle_isL ? 0 : 1;
750 int haystack_chr_shift = haystack_isL ? 0 : 1;
751 int needle_chr_size = needle_isL ? 1 : 2;
752 int haystack_chr_size = haystack_isL ? 1 : 2;
753 load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
754 (load_chr_insn)&MacroAssembler::lhu;
755 load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
756 (load_chr_insn)&MacroAssembler::lhu;
757
758 BLOCK_COMMENT("string_indexof {");
759
760 // Note, inline_string_indexOf() generates checks:
761 // if (pattern.count > src.count) return -1;
762 // if (pattern.count == 0) return 0;
763
764 // We have two strings, a source string in haystack, haystack_len and a pattern string
765 // in needle, needle_len. Find the first occurrence of pattern in source or return -1.
766
767 // For larger pattern and source we use a simplified Boyer Moore algorithm.
768 // With a small pattern and source we use linear scan.
769
770 // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
771 sub(result_tmp, haystack_len, needle_len);
772 // needle_len < 8, use linear scan
773 sub(t0, needle_len, 8);
774 bltz(t0, LINEARSEARCH);
775 // needle_len >= 256, use linear scan
776 sub(t0, needle_len, 256);
777 bgez(t0, LINEARSTUB);
778 // needle_len >= haystack_len/4, use linear scan
779 srli(t0, haystack_len, 2);
780 bge(needle_len, t0, LINEARSTUB);
781
782 // Boyer-Moore-Horspool introduction:
783 // The Boyer Moore alogorithm is based on the description here:-
784 //
785 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
786 //
787 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
788 // and the 'Good Suffix' rule.
789 //
790 // These rules are essentially heuristics for how far we can shift the
791 // pattern along the search string.
792 //
793 // The implementation here uses the 'Bad Character' rule only because of the
794 // complexity of initialisation for the 'Good Suffix' rule.
795 //
796 // This is also known as the Boyer-Moore-Horspool algorithm:
797 //
798 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
799 //
800 // #define ASIZE 256
801 //
802 // int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
803 // int i, j;
804 // unsigned c;
805 // unsigned char bc[ASIZE];
806 //
807 // /* Preprocessing */
808 // for (i = 0; i < ASIZE; ++i)
809 // bc[i] = m;
810 // for (i = 0; i < m - 1; ) {
811 // c = pattern[i];
812 // ++i;
813 // // c < 256 for Latin1 string, so, no need for branch
814 // #ifdef PATTERN_STRING_IS_LATIN1
815 // bc[c] = m - i;
816 // #else
817 // if (c < ASIZE) bc[c] = m - i;
818 // #endif
819 // }
820 //
821 // /* Searching */
822 // j = 0;
823 // while (j <= n - m) {
824 // c = src[i+j];
825 // if (pattern[m-1] == c)
826 // int k;
827 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
828 // if (k < 0) return j;
829 // // c < 256 for Latin1 string, so, no need for branch
830 // #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
831 // // LL case: (c< 256) always true. Remove branch
832 // j += bc[pattern[j+m-1]];
833 // #endif
834 // #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
835 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
836 // if (c < ASIZE)
837 // j += bc[pattern[j+m-1]];
838 // else
839 // j += 1
840 // #endif
841 // #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
842 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
843 // if (c < ASIZE)
844 // j += bc[pattern[j+m-1]];
845 // else
846 // j += m
847 // #endif
848 // }
849 // return -1;
850 // }
851
852 // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
853 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
854 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
855
856 Register haystack_end = haystack_len;
857 Register skipch = tmp2;
858
859 // pattern length is >=8, so, we can read at least 1 register for cases when
860 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
861 // UL case. We'll re-read last character in inner pre-loop code to have
862 // single outer pre-loop load
863 const int firstStep = isLL ? 7 : 3;
864
865 const int ASIZE = 256;
866 const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)
867
868 sub(sp, sp, ASIZE);
869
870 // init BC offset table with default value: needle_len
871 slli(t0, needle_len, 8);
872 orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
873 slli(tmp1, t0, 16);
874 orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
875 slli(tmp1, t0, 32);
876 orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
877
878 mv(ch1, sp); // ch1 is t0
879 mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
880
881 bind(BM_INIT_LOOP);
882 // for (i = 0; i < ASIZE; ++i)
883 // bc[i] = m;
884 for (int i = 0; i < 4; i++) {
885 sd(tmp5, Address(ch1, i * wordSize));
886 }
887 add(ch1, ch1, 32);
888 sub(tmp6, tmp6, 4);
889 bgtz(tmp6, BM_INIT_LOOP);
890
891 sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
892 Register orig_haystack = tmp5;
893 mv(orig_haystack, haystack);
894 // result_tmp = tmp4
895 shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
896 sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
897 mv(tmp3, needle);
898
899 // for (i = 0; i < m - 1; ) {
900 // c = pattern[i];
901 // ++i;
902 // // c < 256 for Latin1 string, so, no need for branch
903 // #ifdef PATTERN_STRING_IS_LATIN1
904 // bc[c] = m - i;
905 // #else
906 // if (c < ASIZE) bc[c] = m - i;
907 // #endif
908 // }
909 bind(BCLOOP);
910 (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
911 add(tmp3, tmp3, needle_chr_size);
912 if (!needle_isL) {
913 // ae == StrIntrinsicNode::UU
914 mv(tmp6, ASIZE);
915 bgeu(ch1, tmp6, BCSKIP);
916 }
917 add(tmp4, sp, ch1);
918 sb(ch2, Address(tmp4)); // store skip offset to BC offset table
919
920 bind(BCSKIP);
921 sub(ch2, ch2, 1); // for next pattern element, skip distance -1
922 bgtz(ch2, BCLOOP);
923
924 // tmp6: pattern end, address after needle
925 shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
926 if (needle_isL == haystack_isL) {
927 // load last 8 bytes (8LL/4UU symbols)
928 ld(tmp6, Address(tmp6, -wordSize));
929 } else {
930 // UL: from UTF-16(source) search Latin1(pattern)
931 lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
932 // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
933 // We'll have to wait until load completed, but it's still faster than per-character loads+checks
934 srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
935 slli(ch2, tmp6, XLEN - 24);
936 srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
937 slli(ch1, tmp6, XLEN - 16);
938 srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
939 andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
940 slli(ch2, ch2, 16);
941 orr(ch2, ch2, ch1); // 0x00000b0c
942 slli(result, tmp3, 48); // use result as temp register
943 orr(tmp6, tmp6, result); // 0x0a00000d
944 slli(result, ch2, 16);
945 orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
946 }
947
948 // i = m - 1;
949 // skipch = j + i;
950 // if (skipch == pattern[m - 1]
951 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
952 // else
953 // move j with bad char offset table
954 bind(BMLOOPSTR2);
955 // compare pattern to source string backward
956 shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
957 (this->*haystack_load_1chr)(skipch, Address(result), noreg);
958 sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
959 if (needle_isL == haystack_isL) {
960 // re-init tmp3. It's for free because it's executed in parallel with
961 // load above. Alternative is to initialize it before loop, but it'll
962 // affect performance on in-order systems with 2 or more ld/st pipelines
963 srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
964 }
965 if (!isLL) { // UU/UL case
966 slli(ch2, nlen_tmp, 1); // offsets in bytes
967 }
968 bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
969 add(result, haystack, isLL ? nlen_tmp : ch2);
970 // load 8 bytes from source string
971 // if isLL is false then read granularity can be 2
972 load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
973 mv(ch1, tmp6);
974 if (isLL) {
975 j(BMLOOPSTR1_AFTER_LOAD);
976 } else {
977 sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
978 j(BMLOOPSTR1_CMP);
979 }
980
981 bind(BMLOOPSTR1);
982 shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
983 (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
984 shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
985 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
986
987 bind(BMLOOPSTR1_AFTER_LOAD);
988 sub(nlen_tmp, nlen_tmp, 1);
989 bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
990
991 bind(BMLOOPSTR1_CMP);
992 beq(ch1, ch2, BMLOOPSTR1);
993
994 bind(BMSKIP);
995 if (!isLL) {
996 // if we've met UTF symbol while searching Latin1 pattern, then we can
997 // skip needle_len symbols
998 if (needle_isL != haystack_isL) {
999 mv(result_tmp, needle_len);
1000 } else {
1001 mv(result_tmp, 1);
1002 }
1003 mv(t0, ASIZE);
1004 bgeu(skipch, t0, BMADV);
1005 }
1006 add(result_tmp, sp, skipch);
1007 lbu(result_tmp, Address(result_tmp)); // load skip offset
1008
1009 bind(BMADV);
1010 sub(nlen_tmp, needle_len, 1);
1011 // move haystack after bad char skip offset
1012 shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
1013 ble(haystack, haystack_end, BMLOOPSTR2);
1014 add(sp, sp, ASIZE);
1015 j(NOMATCH);
1016
1017 bind(BMLOOPSTR1_LASTCMP);
1018 bne(ch1, ch2, BMSKIP);
1019
1020 bind(BMMATCH);
1021 sub(result, haystack, orig_haystack);
1022 if (!haystack_isL) {
1023 srli(result, result, 1);
1024 }
1025 add(sp, sp, ASIZE);
1026 j(DONE);
1027
1028 bind(LINEARSTUB);
1029 sub(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
1030 bltz(t0, LINEARSEARCH);
1031 mv(result, zr);
1032 RuntimeAddress stub = nullptr;
1033 if (isLL) {
1034 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
1035 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
1036 } else if (needle_isL) {
1037 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
1038 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
1039 } else {
1040 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
1041 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
1042 }
1043 address call = trampoline_call(stub);
1044 if (call == nullptr) {
1045 DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
1046 ciEnv::current()->record_failure("CodeCache is full");
1047 return;
1048 }
1049 j(DONE);
1050
1051 bind(NOMATCH);
1052 mv(result, -1);
1053 j(DONE);
1054
1055 bind(LINEARSEARCH);
1056 string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
1057
1058 bind(DONE);
1059 BLOCK_COMMENT("} string_indexof");
1060 }
1061
1062 // string_indexof
1063 // result: x10
1064 // src: x11
1065 // src_count: x12
1066 // pattern: x13
1067 // pattern_count: x14 or 1/2/3/4
1068 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
1069 Register haystack_len, Register needle_len,
1070 Register tmp1, Register tmp2,
1071 Register tmp3, Register tmp4,
1072 int needle_con_cnt, Register result, int ae)
1073 {
1074 // Note:
1075 // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
1076 // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
1077 assert(needle_con_cnt <= 4, "Invalid needle constant count");
1078 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
1079
1080 Register ch1 = t0;
1081 Register ch2 = t1;
1082 Register hlen_neg = haystack_len, nlen_neg = needle_len;
1083 Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
1084
1085 bool isLL = ae == StrIntrinsicNode::LL;
1086
1087 bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
1088 bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
1089 int needle_chr_shift = needle_isL ? 0 : 1;
1090 int haystack_chr_shift = haystack_isL ? 0 : 1;
1091 int needle_chr_size = needle_isL ? 1 : 2;
1092 int haystack_chr_size = haystack_isL ? 1 : 2;
1093
1094 load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
1095 (load_chr_insn)&MacroAssembler::lhu;
1096 load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
1097 (load_chr_insn)&MacroAssembler::lhu;
1098 load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
1099 load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
1100
1101 Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
1102
1103 Register first = tmp3;
1104
1105 if (needle_con_cnt == -1) {
1106 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
1107
1108 sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
1109 bltz(t0, DOSHORT);
1110
1111 (this->*needle_load_1chr)(first, Address(needle), noreg);
1112 slli(t0, needle_len, needle_chr_shift);
1113 add(needle, needle, t0);
1114 neg(nlen_neg, t0);
1115 slli(t0, result_tmp, haystack_chr_shift);
1116 add(haystack, haystack, t0);
1117 neg(hlen_neg, t0);
1118
1119 bind(FIRST_LOOP);
1120 add(t0, haystack, hlen_neg);
1121 (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
1122 beq(first, ch2, STR1_LOOP);
1123
1124 bind(STR2_NEXT);
1125 add(hlen_neg, hlen_neg, haystack_chr_size);
1126 blez(hlen_neg, FIRST_LOOP);
1127 j(NOMATCH);
1128
1129 bind(STR1_LOOP);
1130 add(nlen_tmp, nlen_neg, needle_chr_size);
1131 add(hlen_tmp, hlen_neg, haystack_chr_size);
1132 bgez(nlen_tmp, MATCH);
1133
1134 bind(STR1_NEXT);
1135 add(ch1, needle, nlen_tmp);
1136 (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1137 add(ch2, haystack, hlen_tmp);
1138 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1139 bne(ch1, ch2, STR2_NEXT);
1140 add(nlen_tmp, nlen_tmp, needle_chr_size);
1141 add(hlen_tmp, hlen_tmp, haystack_chr_size);
1142 bltz(nlen_tmp, STR1_NEXT);
1143 j(MATCH);
1144
1145 bind(DOSHORT);
1146 if (needle_isL == haystack_isL) {
1147 sub(t0, needle_len, 2);
1148 bltz(t0, DO1);
1149 bgtz(t0, DO3);
1150 }
1151 }
1152
1153 if (needle_con_cnt == 4) {
1154 Label CH1_LOOP;
1155 (this->*load_4chr)(ch1, Address(needle), noreg);
1156 sub(result_tmp, haystack_len, 4);
1157 slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
1158 add(haystack, haystack, tmp3);
1159 neg(hlen_neg, tmp3);
1160 if (AvoidUnalignedAccesses) {
1161 // preload first value, then we will read by 1 character per loop, instead of four
1162 // just shifting previous ch2 right by size of character in bits
1163 add(tmp3, haystack, hlen_neg);
1164 (this->*load_4chr)(ch2, Address(tmp3), noreg);
1165 if (isLL) {
1166 // need to erase 1 most significant byte in 32-bit value of ch2
1167 slli(ch2, ch2, 40);
1168 srli(ch2, ch2, 32);
1169 } else {
1170 slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
1171 }
1172 }
1173
1174 bind(CH1_LOOP);
1175 add(tmp3, haystack, hlen_neg);
1176 if (AvoidUnalignedAccesses) {
1177 srli(ch2, ch2, isLL ? 8 : 16);
1178 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
1179 slli(tmp3, tmp3, isLL ? 24 : 48);
1180 add(ch2, ch2, tmp3);
1181 } else {
1182 (this->*load_4chr)(ch2, Address(tmp3), noreg);
1183 }
1184 beq(ch1, ch2, MATCH);
1185 add(hlen_neg, hlen_neg, haystack_chr_size);
1186 blez(hlen_neg, CH1_LOOP);
1187 j(NOMATCH);
1188 }
1189
1190 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
1191 Label CH1_LOOP;
1192 BLOCK_COMMENT("string_indexof DO2 {");
1193 bind(DO2);
1194 (this->*load_2chr)(ch1, Address(needle), noreg);
1195 if (needle_con_cnt == 2) {
1196 sub(result_tmp, haystack_len, 2);
1197 }
1198 slli(tmp3, result_tmp, haystack_chr_shift);
1199 add(haystack, haystack, tmp3);
1200 neg(hlen_neg, tmp3);
1201 if (AvoidUnalignedAccesses) {
1202 // preload first value, then we will read by 1 character per loop, instead of two
1203 // just shifting previous ch2 right by size of character in bits
1204 add(tmp3, haystack, hlen_neg);
1205 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1206 slli(ch2, ch2, isLL ? 8 : 16);
1207 }
1208 bind(CH1_LOOP);
1209 add(tmp3, haystack, hlen_neg);
1210 if (AvoidUnalignedAccesses) {
1211 srli(ch2, ch2, isLL ? 8 : 16);
1212 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
1213 slli(tmp3, tmp3, isLL ? 8 : 16);
1214 add(ch2, ch2, tmp3);
1215 } else {
1216 (this->*load_2chr)(ch2, Address(tmp3), noreg);
1217 }
1218 beq(ch1, ch2, MATCH);
1219 add(hlen_neg, hlen_neg, haystack_chr_size);
1220 blez(hlen_neg, CH1_LOOP);
1221 j(NOMATCH);
1222 BLOCK_COMMENT("} string_indexof DO2");
1223 }
1224
1225 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
1226 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1227 BLOCK_COMMENT("string_indexof DO3 {");
1228
1229 bind(DO3);
1230 (this->*load_2chr)(first, Address(needle), noreg);
1231 (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
1232 if (needle_con_cnt == 3) {
1233 sub(result_tmp, haystack_len, 3);
1234 }
1235 slli(hlen_tmp, result_tmp, haystack_chr_shift);
1236 add(haystack, haystack, hlen_tmp);
1237 neg(hlen_neg, hlen_tmp);
1238
1239 bind(FIRST_LOOP);
1240 add(ch2, haystack, hlen_neg);
1241 if (AvoidUnalignedAccesses) {
1242 (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
1243 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1244 slli(tmp2, tmp2, isLL ? 8 : 16);
1245 add(ch2, ch2, tmp2);
1246 } else {
1247 (this->*load_2chr)(ch2, Address(ch2), noreg);
1248 }
1249 beq(first, ch2, STR1_LOOP);
1250
1251 bind(STR2_NEXT);
1252 add(hlen_neg, hlen_neg, haystack_chr_size);
1253 blez(hlen_neg, FIRST_LOOP);
1254 j(NOMATCH);
1255
1256 bind(STR1_LOOP);
1257 add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
1258 add(ch2, haystack, hlen_tmp);
1259 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1260 bne(ch1, ch2, STR2_NEXT);
1261 j(MATCH);
1262 BLOCK_COMMENT("} string_indexof DO3");
1263 }
1264
1265 if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1266 Label DO1_LOOP;
1267
1268 BLOCK_COMMENT("string_indexof DO1 {");
1269 bind(DO1);
1270 (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1271 sub(result_tmp, haystack_len, 1);
1272 slli(tmp3, result_tmp, haystack_chr_shift);
1273 add(haystack, haystack, tmp3);
1274 neg(hlen_neg, tmp3);
1275
1276 bind(DO1_LOOP);
1277 add(tmp3, haystack, hlen_neg);
1278 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1279 beq(ch1, ch2, MATCH);
1280 add(hlen_neg, hlen_neg, haystack_chr_size);
1281 blez(hlen_neg, DO1_LOOP);
1282 BLOCK_COMMENT("} string_indexof DO1");
1283 }
1284
1285 bind(NOMATCH);
1286 mv(result, -1);
1287 j(DONE);
1288
1289 bind(MATCH);
1290 srai(t0, hlen_neg, haystack_chr_shift);
1291 add(result, result_tmp, t0);
1292
1293 bind(DONE);
1294 }
1295
1296 // Compare strings.
1297 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1298 Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1299 Register tmp3, int ae)
1300 {
1301 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1302 DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1303 SHORT_LOOP_START, TAIL_CHECK, L;
1304
1305 const int STUB_THRESHOLD = 64 + 8;
1306 bool isLL = ae == StrIntrinsicNode::LL;
1307 bool isLU = ae == StrIntrinsicNode::LU;
1308 bool isUL = ae == StrIntrinsicNode::UL;
1309
1310 bool str1_isL = isLL || isLU;
1311 bool str2_isL = isLL || isUL;
1312
1313 // for L strings, 1 byte for 1 character
1314 // for U strings, 2 bytes for 1 character
1315 int str1_chr_size = str1_isL ? 1 : 2;
1316 int str2_chr_size = str2_isL ? 1 : 2;
1317 int minCharsInWord = isLL ? wordSize : wordSize / 2;
1318
1319 load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1320 load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1321
1322 BLOCK_COMMENT("string_compare {");
1323
1324 // Bizzarely, the counts are passed in bytes, regardless of whether they
1325 // are L or U strings, however the result is always in characters.
1326 if (!str1_isL) {
1327 sraiw(cnt1, cnt1, 1);
1328 }
1329 if (!str2_isL) {
1330 sraiw(cnt2, cnt2, 1);
1331 }
1332
1333 // Compute the minimum of the string lengths and save the difference in result.
1334 sub(result, cnt1, cnt2);
1335 bgt(cnt1, cnt2, L);
1336 mv(cnt2, cnt1);
1337 bind(L);
1338
1339 // A very short string
1340 mv(t0, minCharsInWord);
1341 ble(cnt2, t0, SHORT_STRING);
1342
1343 // Compare longwords
1344 // load first parts of strings and finish initialization while loading
1345 {
1346 if (str1_isL == str2_isL) { // LL or UU
1347 // check if str1 and str2 is same pointer
1348 beq(str1, str2, DONE);
1349 // load 8 bytes once to compare
1350 ld(tmp1, Address(str1));
1351 ld(tmp2, Address(str2));
1352 mv(t0, STUB_THRESHOLD);
1353 bge(cnt2, t0, STUB);
1354 sub(cnt2, cnt2, minCharsInWord);
1355 beqz(cnt2, TAIL_CHECK);
1356 // convert cnt2 from characters to bytes
1357 if (!str1_isL) {
1358 slli(cnt2, cnt2, 1);
1359 }
1360 add(str2, str2, cnt2);
1361 add(str1, str1, cnt2);
1362 sub(cnt2, zr, cnt2);
1363 } else if (isLU) { // LU case
1364 lwu(tmp1, Address(str1));
1365 ld(tmp2, Address(str2));
1366 mv(t0, STUB_THRESHOLD);
1367 bge(cnt2, t0, STUB);
1368 addi(cnt2, cnt2, -4);
1369 add(str1, str1, cnt2);
1370 sub(cnt1, zr, cnt2);
1371 slli(cnt2, cnt2, 1);
1372 add(str2, str2, cnt2);
1373 inflate_lo32(tmp3, tmp1);
1374 mv(tmp1, tmp3);
1375 sub(cnt2, zr, cnt2);
1376 addi(cnt1, cnt1, 4);
1377 } else { // UL case
1378 ld(tmp1, Address(str1));
1379 lwu(tmp2, Address(str2));
1380 mv(t0, STUB_THRESHOLD);
1381 bge(cnt2, t0, STUB);
1382 addi(cnt2, cnt2, -4);
1383 slli(t0, cnt2, 1);
1384 sub(cnt1, zr, t0);
1385 add(str1, str1, t0);
1386 add(str2, str2, cnt2);
1387 inflate_lo32(tmp3, tmp2);
1388 mv(tmp2, tmp3);
1389 sub(cnt2, zr, cnt2);
1390 addi(cnt1, cnt1, 8);
1391 }
1392 addi(cnt2, cnt2, isUL ? 4 : 8);
1393 bne(tmp1, tmp2, DIFFERENCE);
1394 bgez(cnt2, TAIL);
1395
1396 // main loop
1397 bind(NEXT_WORD);
1398 if (str1_isL == str2_isL) { // LL or UU
1399 add(t0, str1, cnt2);
1400 ld(tmp1, Address(t0));
1401 add(t0, str2, cnt2);
1402 ld(tmp2, Address(t0));
1403 addi(cnt2, cnt2, 8);
1404 } else if (isLU) { // LU case
1405 add(t0, str1, cnt1);
1406 lwu(tmp1, Address(t0));
1407 add(t0, str2, cnt2);
1408 ld(tmp2, Address(t0));
1409 addi(cnt1, cnt1, 4);
1410 inflate_lo32(tmp3, tmp1);
1411 mv(tmp1, tmp3);
1412 addi(cnt2, cnt2, 8);
1413 } else { // UL case
1414 add(t0, str2, cnt2);
1415 lwu(tmp2, Address(t0));
1416 add(t0, str1, cnt1);
1417 ld(tmp1, Address(t0));
1418 inflate_lo32(tmp3, tmp2);
1419 mv(tmp2, tmp3);
1420 addi(cnt1, cnt1, 8);
1421 addi(cnt2, cnt2, 4);
1422 }
1423 bne(tmp1, tmp2, DIFFERENCE);
1424 bltz(cnt2, NEXT_WORD);
1425 bind(TAIL);
1426 if (str1_isL == str2_isL) { // LL or UU
1427 load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1428 load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1429 } else if (isLU) { // LU case
1430 load_int_misaligned(tmp1, Address(str1), tmp3, false);
1431 load_long_misaligned(tmp2, Address(str2), tmp3, 2);
1432 inflate_lo32(tmp3, tmp1);
1433 mv(tmp1, tmp3);
1434 } else { // UL case
1435 load_int_misaligned(tmp2, Address(str2), tmp3, false);
1436 load_long_misaligned(tmp1, Address(str1), tmp3, 2);
1437 inflate_lo32(tmp3, tmp2);
1438 mv(tmp2, tmp3);
1439 }
1440 bind(TAIL_CHECK);
1441 beq(tmp1, tmp2, DONE);
1442
1443 // Find the first different characters in the longwords and
1444 // compute their difference.
1445 bind(DIFFERENCE);
1446 xorr(tmp3, tmp1, tmp2);
1447 ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
1448 srl(tmp1, tmp1, result);
1449 srl(tmp2, tmp2, result);
1450 if (isLL) {
1451 andi(tmp1, tmp1, 0xFF);
1452 andi(tmp2, tmp2, 0xFF);
1453 } else {
1454 andi(tmp1, tmp1, 0xFFFF);
1455 andi(tmp2, tmp2, 0xFFFF);
1456 }
1457 sub(result, tmp1, tmp2);
1458 j(DONE);
1459 }
1460
1461 bind(STUB);
1462 RuntimeAddress stub = nullptr;
1463 switch (ae) {
1464 case StrIntrinsicNode::LL:
1465 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1466 break;
1467 case StrIntrinsicNode::UU:
1468 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1469 break;
1470 case StrIntrinsicNode::LU:
1471 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1472 break;
1473 case StrIntrinsicNode::UL:
1474 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1475 break;
1476 default:
1477 ShouldNotReachHere();
1478 }
1479 assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1480 address call = trampoline_call(stub);
1481 if (call == nullptr) {
1482 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1483 ciEnv::current()->record_failure("CodeCache is full");
1484 return;
1485 }
1486 j(DONE);
1487
1488 bind(SHORT_STRING);
1489 // Is the minimum length zero?
1490 beqz(cnt2, DONE);
1491 // arrange code to do most branches while loading and loading next characters
1492 // while comparing previous
1493 (this->*str1_load_chr)(tmp1, Address(str1), t0);
1494 addi(str1, str1, str1_chr_size);
1495 addi(cnt2, cnt2, -1);
1496 beqz(cnt2, SHORT_LAST_INIT);
1497 (this->*str2_load_chr)(cnt1, Address(str2), t0);
1498 addi(str2, str2, str2_chr_size);
1499 j(SHORT_LOOP_START);
1500 bind(SHORT_LOOP);
1501 addi(cnt2, cnt2, -1);
1502 beqz(cnt2, SHORT_LAST);
1503 bind(SHORT_LOOP_START);
1504 (this->*str1_load_chr)(tmp2, Address(str1), t0);
1505 addi(str1, str1, str1_chr_size);
1506 (this->*str2_load_chr)(t0, Address(str2), t0);
1507 addi(str2, str2, str2_chr_size);
1508 bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1509 addi(cnt2, cnt2, -1);
1510 beqz(cnt2, SHORT_LAST2);
1511 (this->*str1_load_chr)(tmp1, Address(str1), t0);
1512 addi(str1, str1, str1_chr_size);
1513 (this->*str2_load_chr)(cnt1, Address(str2), t0);
1514 addi(str2, str2, str2_chr_size);
1515 beq(tmp2, t0, SHORT_LOOP);
1516 sub(result, tmp2, t0);
1517 j(DONE);
1518 bind(SHORT_LOOP_TAIL);
1519 sub(result, tmp1, cnt1);
1520 j(DONE);
1521 bind(SHORT_LAST2);
1522 beq(tmp2, t0, DONE);
1523 sub(result, tmp2, t0);
1524
1525 j(DONE);
1526 bind(SHORT_LAST_INIT);
1527 (this->*str2_load_chr)(cnt1, Address(str2), t0);
1528 addi(str2, str2, str2_chr_size);
1529 bind(SHORT_LAST);
1530 beq(tmp1, cnt1, DONE);
1531 sub(result, tmp1, cnt1);
1532
1533 bind(DONE);
1534
1535 BLOCK_COMMENT("} string_compare");
1536 }
1537
1538 void C2_MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
1539 Register tmp4, Register tmp5, Register tmp6, Register result,
1540 Register cnt1, int elem_size) {
1541 Label DONE, SAME, NEXT_DWORD, SHORT, TAIL, TAIL2, IS_TMP5_ZR;
1542 Register tmp1 = t0;
1543 Register tmp2 = t1;
1544 Register cnt2 = tmp2; // cnt2 only used in array length compare
1545 Register elem_per_word = tmp6;
1546 int log_elem_size = exact_log2(elem_size);
1547 int length_offset = arrayOopDesc::length_offset_in_bytes();
1548 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1549
1550 assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1551 assert_different_registers(a1, a2, result, cnt1, t0, t1, tmp3, tmp4, tmp5, tmp6);
1552 mv(elem_per_word, wordSize / elem_size);
1553
1554 BLOCK_COMMENT("arrays_equals {");
1555
1556 // if (a1 == a2), return true
1557 beq(a1, a2, SAME);
1558
1559 mv(result, false);
1560 beqz(a1, DONE);
1561 beqz(a2, DONE);
1562 lwu(cnt1, Address(a1, length_offset));
1563 lwu(cnt2, Address(a2, length_offset));
1564 bne(cnt2, cnt1, DONE);
1565 beqz(cnt1, SAME);
1566
1567 slli(tmp5, cnt1, 3 + log_elem_size);
1568 sub(tmp5, zr, tmp5);
1569 add(a1, a1, base_offset);
1570 add(a2, a2, base_offset);
1571 ld(tmp3, Address(a1, 0));
1572 ld(tmp4, Address(a2, 0));
1573 ble(cnt1, elem_per_word, SHORT); // short or same
1574
1575 // Main 16 byte comparison loop with 2 exits
1576 bind(NEXT_DWORD); {
1577 ld(tmp1, Address(a1, wordSize));
1578 ld(tmp2, Address(a2, wordSize));
1579 sub(cnt1, cnt1, 2 * wordSize / elem_size);
1580 blez(cnt1, TAIL);
1581 bne(tmp3, tmp4, DONE);
1582 ld(tmp3, Address(a1, 2 * wordSize));
1583 ld(tmp4, Address(a2, 2 * wordSize));
1584 add(a1, a1, 2 * wordSize);
1585 add(a2, a2, 2 * wordSize);
1586 ble(cnt1, elem_per_word, TAIL2);
1587 } beq(tmp1, tmp2, NEXT_DWORD);
1588 j(DONE);
1589
1590 bind(TAIL);
1591 xorr(tmp4, tmp3, tmp4);
1592 xorr(tmp2, tmp1, tmp2);
1593 sll(tmp2, tmp2, tmp5);
1594 orr(tmp5, tmp4, tmp2);
1595 j(IS_TMP5_ZR);
1596
1597 bind(TAIL2);
1598 bne(tmp1, tmp2, DONE);
1599
1600 bind(SHORT);
1601 xorr(tmp4, tmp3, tmp4);
1602 sll(tmp5, tmp4, tmp5);
1603
1604 bind(IS_TMP5_ZR);
1605 bnez(tmp5, DONE);
1606
1607 bind(SAME);
1608 mv(result, true);
1609 // That's it.
1610 bind(DONE);
1611
1612 BLOCK_COMMENT("} array_equals");
1613 }
1614
1615 // Compare Strings
1616
1617 // For Strings we're passed the address of the first characters in a1
1618 // and a2 and the length in cnt1.
1619 // elem_size is the element size in bytes: either 1 or 2.
1620 // There are two implementations. For arrays >= 8 bytes, all
1621 // comparisons (for hw supporting unaligned access: including the final one,
1622 // which may overlap) are performed 8 bytes at a time.
1623 // For strings < 8 bytes (and for tails of long strings when
1624 // AvoidUnalignedAccesses is true), we compare a
1625 // halfword, then a short, and then a byte.
1626
1627 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1628 Register result, Register cnt1, int elem_size)
1629 {
1630 Label SAME, DONE, SHORT, NEXT_WORD;
1631 Register tmp1 = t0;
1632 Register tmp2 = t1;
1633
1634 assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
1635 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1636
1637 BLOCK_COMMENT("string_equals {");
1638
1639 beqz(cnt1, SAME);
1640 mv(result, false);
1641
1642 // Check for short strings, i.e. smaller than wordSize.
1643 sub(cnt1, cnt1, wordSize);
1644 bltz(cnt1, SHORT);
1645
1646 // Main 8 byte comparison loop.
1647 bind(NEXT_WORD); {
1648 ld(tmp1, Address(a1, 0));
1649 add(a1, a1, wordSize);
1650 ld(tmp2, Address(a2, 0));
1651 add(a2, a2, wordSize);
1652 sub(cnt1, cnt1, wordSize);
1653 bne(tmp1, tmp2, DONE);
1654 } bgez(cnt1, NEXT_WORD);
1655
1656 if (!AvoidUnalignedAccesses) {
1657 // Last longword. In the case where length == 4 we compare the
1658 // same longword twice, but that's still faster than another
1659 // conditional branch.
1660 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
1661 // length == 4.
1662 add(tmp1, a1, cnt1);
1663 ld(tmp1, Address(tmp1, 0));
1664 add(tmp2, a2, cnt1);
1665 ld(tmp2, Address(tmp2, 0));
1666 bne(tmp1, tmp2, DONE);
1667 j(SAME);
1668 } else {
1669 add(tmp1, cnt1, wordSize);
1670 beqz(tmp1, SAME);
1671 }
1672
1673 bind(SHORT);
1674 Label TAIL03, TAIL01;
1675
1676 // 0-7 bytes left.
1677 test_bit(tmp1, cnt1, 2);
1678 beqz(tmp1, TAIL03);
1679 {
1680 lwu(tmp1, Address(a1, 0));
1681 add(a1, a1, 4);
1682 lwu(tmp2, Address(a2, 0));
1683 add(a2, a2, 4);
1684 bne(tmp1, tmp2, DONE);
1685 }
1686
1687 bind(TAIL03);
1688 // 0-3 bytes left.
1689 test_bit(tmp1, cnt1, 1);
1690 beqz(tmp1, TAIL01);
1691 {
1692 lhu(tmp1, Address(a1, 0));
1693 add(a1, a1, 2);
1694 lhu(tmp2, Address(a2, 0));
1695 add(a2, a2, 2);
1696 bne(tmp1, tmp2, DONE);
1697 }
1698
1699 bind(TAIL01);
1700 if (elem_size == 1) { // Only needed when comparing 1-byte elements
1701 // 0-1 bytes left.
1702 test_bit(tmp1, cnt1, 0);
1703 beqz(tmp1, SAME);
1704 {
1705 lbu(tmp1, Address(a1, 0));
1706 lbu(tmp2, Address(a2, 0));
1707 bne(tmp1, tmp2, DONE);
1708 }
1709 }
1710
1711 // Arrays are equal.
1712 bind(SAME);
1713 mv(result, true);
1714
1715 // That's it.
1716 bind(DONE);
1717 BLOCK_COMMENT("} string_equals");
1718 }
1719
1720 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1721 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1722 bool is_far, bool is_unordered);
1723
1724 static conditional_branch_insn conditional_branches[] =
1725 {
1726 /* SHORT branches */
1727 (conditional_branch_insn)&MacroAssembler::beq,
1728 (conditional_branch_insn)&MacroAssembler::bgt,
1729 nullptr, // BoolTest::overflow
1730 (conditional_branch_insn)&MacroAssembler::blt,
1731 (conditional_branch_insn)&MacroAssembler::bne,
1732 (conditional_branch_insn)&MacroAssembler::ble,
1733 nullptr, // BoolTest::no_overflow
1734 (conditional_branch_insn)&MacroAssembler::bge,
1735
1736 /* UNSIGNED branches */
1737 (conditional_branch_insn)&MacroAssembler::beq,
1738 (conditional_branch_insn)&MacroAssembler::bgtu,
1739 nullptr,
1740 (conditional_branch_insn)&MacroAssembler::bltu,
1741 (conditional_branch_insn)&MacroAssembler::bne,
1742 (conditional_branch_insn)&MacroAssembler::bleu,
1743 nullptr,
1744 (conditional_branch_insn)&MacroAssembler::bgeu
1745 };
1746
1747 static float_conditional_branch_insn float_conditional_branches[] =
1748 {
1749 /* FLOAT SHORT branches */
1750 (float_conditional_branch_insn)&MacroAssembler::float_beq,
1751 (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1752 nullptr, // BoolTest::overflow
1753 (float_conditional_branch_insn)&MacroAssembler::float_blt,
1754 (float_conditional_branch_insn)&MacroAssembler::float_bne,
1755 (float_conditional_branch_insn)&MacroAssembler::float_ble,
1756 nullptr, // BoolTest::no_overflow
1757 (float_conditional_branch_insn)&MacroAssembler::float_bge,
1758
1759 /* DOUBLE SHORT branches */
1760 (float_conditional_branch_insn)&MacroAssembler::double_beq,
1761 (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1762 nullptr,
1763 (float_conditional_branch_insn)&MacroAssembler::double_blt,
1764 (float_conditional_branch_insn)&MacroAssembler::double_bne,
1765 (float_conditional_branch_insn)&MacroAssembler::double_ble,
1766 nullptr,
1767 (float_conditional_branch_insn)&MacroAssembler::double_bge
1768 };
1769
1770 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1771 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1772 "invalid conditional branch index");
1773 (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1774 }
1775
1776 // This is a function should only be used by C2. Flip the unordered when unordered-greater, C2 would use
1777 // unordered-lesser instead of unordered-greater. Finally, commute the result bits at function do_one_bytecode().
1778 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1779 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1780 "invalid float conditional branch index");
1781 int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1782 (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1783 (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1784 }
1785
1786 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1787 switch (cmpFlag) {
1788 case BoolTest::eq:
1789 case BoolTest::le:
1790 beqz(op1, L, is_far);
1791 break;
1792 case BoolTest::ne:
1793 case BoolTest::gt:
1794 bnez(op1, L, is_far);
1795 break;
1796 default:
1797 ShouldNotReachHere();
1798 }
1799 }
1800
1801 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1802 switch (cmpFlag) {
1803 case BoolTest::eq:
1804 beqz(op1, L, is_far);
1805 break;
1806 case BoolTest::ne:
1807 bnez(op1, L, is_far);
1808 break;
1809 default:
1810 ShouldNotReachHere();
1811 }
1812 }
1813
1814 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
1815 Label L;
1816 cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
1817 mv(dst, src);
1818 bind(L);
1819 }
1820
1821 // Set dst to NaN if any NaN input.
1822 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
1823 bool is_double, bool is_min) {
1824 assert_different_registers(dst, src1, src2);
1825
1826 Label Done, Compare;
1827
1828 is_double ? fclass_d(t0, src1)
1829 : fclass_s(t0, src1);
1830 is_double ? fclass_d(t1, src2)
1831 : fclass_s(t1, src2);
1832 orr(t0, t0, t1);
1833 andi(t0, t0, 0b1100000000); //if src1 or src2 is quiet or signaling NaN then return NaN
1834 beqz(t0, Compare);
1835 is_double ? fadd_d(dst, src1, src2)
1836 : fadd_s(dst, src1, src2);
1837 j(Done);
1838
1839 bind(Compare);
1840 if (is_double) {
1841 is_min ? fmin_d(dst, src1, src2)
1842 : fmax_d(dst, src1, src2);
1843 } else {
1844 is_min ? fmin_s(dst, src1, src2)
1845 : fmax_s(dst, src1, src2);
1846 }
1847
1848 bind(Done);
1849 }
1850
1851 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
1852 VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
1853 Label loop;
1854 Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
1855
1856 bind(loop);
1857 vsetvli(tmp1, cnt, sew, Assembler::m2);
1858 vlex_v(vr1, a1, sew);
1859 vlex_v(vr2, a2, sew);
1860 vmsne_vv(vrs, vr1, vr2);
1861 vfirst_m(tmp2, vrs);
1862 bgez(tmp2, DONE);
1863 sub(cnt, cnt, tmp1);
1864 if (!islatin) {
1865 slli(tmp1, tmp1, 1); // get byte counts
1866 }
1867 add(a1, a1, tmp1);
1868 add(a2, a2, tmp1);
1869 bnez(cnt, loop);
1870
1871 mv(result, true);
1872 }
1873
1874 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt, int elem_size) {
1875 Label DONE;
1876 Register tmp1 = t0;
1877 Register tmp2 = t1;
1878
1879 BLOCK_COMMENT("string_equals_v {");
1880
1881 mv(result, false);
1882
1883 if (elem_size == 2) {
1884 srli(cnt, cnt, 1);
1885 }
1886
1887 element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);
1888
1889 bind(DONE);
1890 BLOCK_COMMENT("} string_equals_v");
1891 }
1892
1893 // used by C2 ClearArray patterns.
1894 // base: Address of a buffer to be zeroed
1895 // cnt: Count in HeapWords
1896 //
1897 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
1898 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
1899 Label loop;
1900
1901 // making zero words
1902 vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
1903 vxor_vv(v4, v4, v4);
1904
1905 bind(loop);
1906 vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
1907 vse64_v(v4, base);
1908 sub(cnt, cnt, t0);
1909 shadd(base, t0, base, t0, 3);
1910 bnez(cnt, loop);
1911 }
1912
1913 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
1914 Register cnt1, int elem_size) {
1915 Label DONE;
1916 Register tmp1 = t0;
1917 Register tmp2 = t1;
1918 Register cnt2 = tmp2;
1919 int length_offset = arrayOopDesc::length_offset_in_bytes();
1920 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1921
1922 BLOCK_COMMENT("arrays_equals_v {");
1923
1924 // if (a1 == a2), return true
1925 mv(result, true);
1926 beq(a1, a2, DONE);
1927
1928 mv(result, false);
1929 // if a1 == null or a2 == null, return false
1930 beqz(a1, DONE);
1931 beqz(a2, DONE);
1932 // if (a1.length != a2.length), return false
1933 lwu(cnt1, Address(a1, length_offset));
1934 lwu(cnt2, Address(a2, length_offset));
1935 bne(cnt1, cnt2, DONE);
1936
1937 la(a1, Address(a1, base_offset));
1938 la(a2, Address(a2, base_offset));
1939
1940 element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);
1941
1942 bind(DONE);
1943
1944 BLOCK_COMMENT("} arrays_equals_v");
1945 }
1946
1947 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
1948 Register result, Register tmp1, Register tmp2, int encForm) {
1949 Label DIFFERENCE, DONE, L, loop;
1950 bool encLL = encForm == StrIntrinsicNode::LL;
1951 bool encLU = encForm == StrIntrinsicNode::LU;
1952 bool encUL = encForm == StrIntrinsicNode::UL;
1953
1954 bool str1_isL = encLL || encLU;
1955 bool str2_isL = encLL || encUL;
1956
1957 int minCharsInWord = encLL ? wordSize : wordSize / 2;
1958
1959 BLOCK_COMMENT("string_compare {");
1960
1961 // for Latin strings, 1 byte for 1 character
1962 // for UTF16 strings, 2 bytes for 1 character
1963 if (!str1_isL)
1964 sraiw(cnt1, cnt1, 1);
1965 if (!str2_isL)
1966 sraiw(cnt2, cnt2, 1);
1967
1968 // if str1 == str2, return the difference
1969 // save the minimum of the string lengths in cnt2.
1970 sub(result, cnt1, cnt2);
1971 bgt(cnt1, cnt2, L);
1972 mv(cnt2, cnt1);
1973 bind(L);
1974
1975 if (str1_isL == str2_isL) { // LL or UU
1976 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE);
1977 j(DONE);
1978 } else { // LU or UL
1979 Register strL = encLU ? str1 : str2;
1980 Register strU = encLU ? str2 : str1;
1981 VectorRegister vstr1 = encLU ? v8 : v4;
1982 VectorRegister vstr2 = encLU ? v4 : v8;
1983
1984 bind(loop);
1985 vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
1986 vle8_v(vstr1, strL);
1987 vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
1988 vzext_vf2(vstr2, vstr1);
1989 vle16_v(vstr1, strU);
1990 vmsne_vv(v4, vstr2, vstr1);
1991 vfirst_m(tmp2, v4);
1992 bgez(tmp2, DIFFERENCE);
1993 sub(cnt2, cnt2, tmp1);
1994 add(strL, strL, tmp1);
1995 shadd(strU, tmp1, strU, tmp1, 1);
1996 bnez(cnt2, loop);
1997 j(DONE);
1998 }
1999
2000 bind(DIFFERENCE);
2001 slli(tmp1, tmp2, 1);
2002 add(str1, str1, str1_isL ? tmp2 : tmp1);
2003 add(str2, str2, str2_isL ? tmp2 : tmp1);
2004 str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2005 str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2006 sub(result, tmp1, tmp2);
2007
2008 bind(DONE);
2009 }
2010
2011 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2012 Label loop;
2013 assert_different_registers(src, dst, len, tmp, t0);
2014
2015 BLOCK_COMMENT("byte_array_inflate_v {");
2016 bind(loop);
2017 vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2018 vle8_v(v6, src);
2019 vsetvli(t0, len, Assembler::e16, Assembler::m4);
2020 vzext_vf2(v4, v6);
2021 vse16_v(v4, dst);
2022 sub(len, len, tmp);
2023 add(src, src, tmp);
2024 shadd(dst, tmp, dst, tmp, 1);
2025 bnez(len, loop);
2026 BLOCK_COMMENT("} byte_array_inflate_v");
2027 }
2028
2029 // Compress char[] array to byte[].
2030 // result: the array length if every element in array can be encoded; 0, otherwise.
2031 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2032 Register result, Register tmp) {
2033 Label done;
2034 encode_iso_array_v(src, dst, len, result, tmp, false);
2035 beqz(len, done);
2036 mv(result, zr);
2037 bind(done);
2038 }
2039
2040 // Intrinsic for
2041 //
2042 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
2043 // return the number of characters copied.
2044 // - java/lang/StringUTF16.compress
2045 // return zero (0) if copy fails, otherwise 'len'.
2046 //
2047 // This version always returns the number of characters copied. A successful
2048 // copy will complete with the post-condition: 'res' == 'len', while an
2049 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2050 //
2051 // Clobbers: src, dst, len, result, t0
2052 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2053 Register result, Register tmp, bool ascii) {
2054 Label loop, fail, done;
2055
2056 BLOCK_COMMENT("encode_iso_array_v {");
2057 mv(result, 0);
2058
2059 bind(loop);
2060 mv(tmp, ascii ? 0x7f : 0xff);
2061 vsetvli(t0, len, Assembler::e16, Assembler::m2);
2062 vle16_v(v2, src);
2063
2064 vmsgtu_vx(v1, v2, tmp);
2065 vfirst_m(tmp, v1);
2066 vmsbf_m(v0, v1);
2067 // compress char to byte
2068 vsetvli(t0, len, Assembler::e8);
2069 vncvt_x_x_w(v1, v2, Assembler::v0_t);
2070 vse8_v(v1, dst, Assembler::v0_t);
2071
2072 // fail if char > 0x7f/0xff
2073 bgez(tmp, fail);
2074 add(result, result, t0);
2075 add(dst, dst, t0);
2076 sub(len, len, t0);
2077 shadd(src, t0, src, t0, 1);
2078 bnez(len, loop);
2079 j(done);
2080
2081 bind(fail);
2082 add(result, result, tmp);
2083
2084 bind(done);
2085 BLOCK_COMMENT("} encode_iso_array_v");
2086 }
2087
2088 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2089 Label LOOP, SET_RESULT, DONE;
2090
2091 BLOCK_COMMENT("count_positives_v {");
2092 assert_different_registers(ary, len, result, tmp);
2093
2094 mv(result, zr);
2095
2096 bind(LOOP);
2097 vsetvli(t0, len, Assembler::e8, Assembler::m4);
2098 vle8_v(v4, ary);
2099 vmslt_vx(v4, v4, zr);
2100 vfirst_m(tmp, v4);
2101 bgez(tmp, SET_RESULT);
2102 // if tmp == -1, all bytes are positive
2103 add(result, result, t0);
2104
2105 sub(len, len, t0);
2106 add(ary, ary, t0);
2107 bnez(len, LOOP);
2108 j(DONE);
2109
2110 // add remaining positive bytes count
2111 bind(SET_RESULT);
2112 add(result, result, tmp);
2113
2114 bind(DONE);
2115 BLOCK_COMMENT("} count_positives_v");
2116 }
2117
2118 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2119 Register ch, Register result,
2120 Register tmp1, Register tmp2,
2121 bool isL) {
2122 mv(result, zr);
2123
2124 Label loop, MATCH, DONE;
2125 Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2126 bind(loop);
2127 vsetvli(tmp1, cnt1, sew, Assembler::m4);
2128 vlex_v(v4, str1, sew);
2129 vmseq_vx(v4, v4, ch);
2130 vfirst_m(tmp2, v4);
2131 bgez(tmp2, MATCH); // if equal, return index
2132
2133 add(result, result, tmp1);
2134 sub(cnt1, cnt1, tmp1);
2135 if (!isL) slli(tmp1, tmp1, 1);
2136 add(str1, str1, tmp1);
2137 bnez(cnt1, loop);
2138
2139 mv(result, -1);
2140 j(DONE);
2141
2142 bind(MATCH);
2143 add(result, result, tmp2);
2144
2145 bind(DONE);
2146 }
2147
2148 // Set dst to NaN if any NaN input.
2149 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2150 bool is_double, bool is_min, int vector_length) {
2151 assert_different_registers(dst, src1, src2);
2152
2153 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2154
2155 is_min ? vfmin_vv(dst, src1, src2)
2156 : vfmax_vv(dst, src1, src2);
2157
2158 vmfne_vv(v0, src1, src1);
2159 vfadd_vv(dst, src1, src1, Assembler::v0_t);
2160 vmfne_vv(v0, src2, src2);
2161 vfadd_vv(dst, src2, src2, Assembler::v0_t);
2162 }
2163
2164 // Set dst to NaN if any NaN input.
2165 // The destination vector register elements corresponding to masked-off elements
2166 // are handled with a mask-undisturbed policy.
2167 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2168 VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2169 bool is_double, bool is_min, int vector_length) {
2170 assert_different_registers(src1, src2, tmp1, tmp2);
2171 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2172
2173 // Check vector elements of src1 and src2 for NaN.
2174 vmfeq_vv(tmp1, src1, src1);
2175 vmfeq_vv(tmp2, src2, src2);
2176
2177 vmandn_mm(v0, vmask, tmp1);
2178 vfadd_vv(dst, src1, src1, Assembler::v0_t);
2179 vmandn_mm(v0, vmask, tmp2);
2180 vfadd_vv(dst, src2, src2, Assembler::v0_t);
2181
2182 vmand_mm(tmp2, tmp1, tmp2);
2183 vmand_mm(v0, vmask, tmp2);
2184 is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
2185 : vfmax_vv(dst, src1, src2, Assembler::v0_t);
2186 }
2187
2188 // Set dst to NaN if any NaN input.
2189 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
2190 FloatRegister src1, VectorRegister src2,
2191 VectorRegister tmp1, VectorRegister tmp2,
2192 bool is_double, bool is_min, int vector_length, VectorMask vm) {
2193 assert_different_registers(dst, src1);
2194 assert_different_registers(src2, tmp1, tmp2);
2195
2196 Label L_done, L_NaN_1, L_NaN_2;
2197 // Set dst to src1 if src1 is NaN
2198 is_double ? feq_d(t0, src1, src1)
2199 : feq_s(t0, src1, src1);
2200 beqz(t0, L_NaN_2);
2201
2202 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2203 vfmv_s_f(tmp2, src1);
2204
2205 is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
2206 : vfredmax_vs(tmp1, src2, tmp2, vm);
2207 vfmv_f_s(dst, tmp1);
2208
2209 // Checking NaNs in src2
2210 vmfne_vv(tmp1, src2, src2, vm);
2211 vcpop_m(t0, tmp1, vm);
2212 beqz(t0, L_done);
2213
2214 bind(L_NaN_1);
2215 vfredusum_vs(tmp1, src2, tmp2, vm);
2216 vfmv_f_s(dst, tmp1);
2217 j(L_done);
2218
2219 bind(L_NaN_2);
2220 is_double ? fmv_d(dst, src1)
2221 : fmv_s(dst, src1);
2222 bind(L_done);
2223 }
2224
2225 bool C2_MacroAssembler::in_scratch_emit_size() {
2226 if (ciEnv::current()->task() != nullptr) {
2227 PhaseOutput* phase_output = Compile::current()->output();
2228 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2229 return true;
2230 }
2231 }
2232 return MacroAssembler::in_scratch_emit_size();
2233 }
2234
2235 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
2236 VectorRegister src2, VectorRegister tmp,
2237 int opc, BasicType bt, int vector_length, VectorMask vm) {
2238 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2239 vsetvli_helper(bt, vector_length);
2240 vmv_s_x(tmp, src1);
2241 switch (opc) {
2242 case Op_AddReductionVI:
2243 case Op_AddReductionVL:
2244 vredsum_vs(tmp, src2, tmp, vm);
2245 break;
2246 case Op_AndReductionV:
2247 vredand_vs(tmp, src2, tmp, vm);
2248 break;
2249 case Op_OrReductionV:
2250 vredor_vs(tmp, src2, tmp, vm);
2251 break;
2252 case Op_XorReductionV:
2253 vredxor_vs(tmp, src2, tmp, vm);
2254 break;
2255 case Op_MaxReductionV:
2256 vredmax_vs(tmp, src2, tmp, vm);
2257 break;
2258 case Op_MinReductionV:
2259 vredmin_vs(tmp, src2, tmp, vm);
2260 break;
2261 default:
2262 ShouldNotReachHere();
2263 }
2264 vmv_x_s(dst, tmp);
2265 }
2266
2267 // Set vl and vtype for full and partial vector operations.
2268 // (vma = mu, vta = tu, vill = false)
2269 void C2_MacroAssembler::vsetvli_helper(BasicType bt, int vector_length, LMUL vlmul, Register tmp) {
2270 Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
2271 if (vector_length <= 31) {
2272 vsetivli(tmp, vector_length, sew, vlmul);
2273 } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
2274 vsetvli(tmp, x0, sew, vlmul);
2275 } else {
2276 mv(tmp, vector_length);
2277 vsetvli(tmp, tmp, sew, vlmul);
2278 }
2279 }
2280
2281 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2282 int cond, BasicType bt, int vector_length, VectorMask vm) {
2283 assert(is_integral_type(bt), "unsupported element type");
2284 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2285 vsetvli_helper(bt, vector_length);
2286 vmclr_m(vd);
2287 switch (cond) {
2288 case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
2289 case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
2290 case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
2291 case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
2292 case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
2293 case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
2294 default:
2295 assert(false, "unsupported compare condition");
2296 ShouldNotReachHere();
2297 }
2298 }
2299
2300 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
2301 int cond, BasicType bt, int vector_length, VectorMask vm) {
2302 assert(is_floating_point_type(bt), "unsupported element type");
2303 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
2304 vsetvli_helper(bt, vector_length);
2305 vmclr_m(vd);
2306 switch (cond) {
2307 case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
2308 case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
2309 case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
2310 case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
2311 case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
2312 case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
2313 default:
2314 assert(false, "unsupported compare condition");
2315 ShouldNotReachHere();
2316 }
2317 }
2318
2319 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, int vector_length,
2320 VectorRegister src, BasicType src_bt) {
2321 assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
2322 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2323 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
2324 // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
2325 // and the overlap is in the highest-numbered part of the destination register group.
2326 // Since LMUL=1, vd and vs cannot be the same.
2327 assert_different_registers(dst, src);
2328
2329 vsetvli_helper(dst_bt, vector_length);
2330 if (src_bt == T_BYTE) {
2331 switch (dst_bt) {
2332 case T_SHORT:
2333 vsext_vf2(dst, src);
2334 break;
2335 case T_INT:
2336 vsext_vf4(dst, src);
2337 break;
2338 case T_LONG:
2339 vsext_vf8(dst, src);
2340 break;
2341 default:
2342 ShouldNotReachHere();
2343 }
2344 } else if (src_bt == T_SHORT) {
2345 if (dst_bt == T_INT) {
2346 vsext_vf2(dst, src);
2347 } else {
2348 vsext_vf4(dst, src);
2349 }
2350 } else if (src_bt == T_INT) {
2351 vsext_vf2(dst, src);
2352 }
2353 }
2354
2355 // Vector narrow from src to dst with specified element sizes.
2356 // High part of dst vector will be filled with zero.
2357 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, int vector_length,
2358 VectorRegister src, BasicType src_bt) {
2359 assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
2360 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
2361 mv(t0, vector_length);
2362 if (src_bt == T_LONG) {
2363 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
2364 // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
2365 // So we can currently only scale down by 1/2 the width at a time.
2366 vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
2367 vncvt_x_x_w(dst, src);
2368 if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
2369 vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2370 vncvt_x_x_w(dst, dst);
2371 if (dst_bt == T_BYTE) {
2372 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2373 vncvt_x_x_w(dst, dst);
2374 }
2375 }
2376 } else if (src_bt == T_INT) {
2377 // T_SHORT
2378 vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
2379 vncvt_x_x_w(dst, src);
2380 if (dst_bt == T_BYTE) {
2381 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2382 vncvt_x_x_w(dst, dst);
2383 }
2384 } else if (src_bt == T_SHORT) {
2385 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
2386 vncvt_x_x_w(dst, src);
2387 }
2388 }
2389
2390 #define VFCVT_SAFE(VFLOATCVT) \
2391 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
2392 assert_different_registers(dst, src); \
2393 vxor_vv(dst, dst, dst); \
2394 vmfeq_vv(v0, src, src); \
2395 VFLOATCVT(dst, src, Assembler::v0_t); \
2396 }
2397
2398 VFCVT_SAFE(vfcvt_rtz_x_f_v);
2399
2400 #undef VFCVT_SAFE
2401
2402 // Extract a scalar element from an vector at position 'idx'.
2403 // The input elements in src are expected to be of integral type.
2404 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
2405 int idx, VectorRegister tmp) {
2406 assert(is_integral_type(bt), "unsupported element type");
2407 assert(idx >= 0, "idx cannot be negative");
2408 // Only need the first element after vector slidedown
2409 vsetvli_helper(bt, 1);
2410 if (idx == 0) {
2411 vmv_x_s(dst, src);
2412 } else if (idx <= 31) {
2413 vslidedown_vi(tmp, src, idx);
2414 vmv_x_s(dst, tmp);
2415 } else {
2416 mv(t0, idx);
2417 vslidedown_vx(tmp, src, t0);
2418 vmv_x_s(dst, tmp);
2419 }
2420 }
2421
2422 // Extract a scalar element from an vector at position 'idx'.
2423 // The input elements in src are expected to be of floating point type.
2424 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
2425 int idx, VectorRegister tmp) {
2426 assert(is_floating_point_type(bt), "unsupported element type");
2427 assert(idx >= 0, "idx cannot be negative");
2428 // Only need the first element after vector slidedown
2429 vsetvli_helper(bt, 1);
2430 if (idx == 0) {
2431 vfmv_f_s(dst, src);
2432 } else if (idx <= 31) {
2433 vslidedown_vi(tmp, src, idx);
2434 vfmv_f_s(dst, tmp);
2435 } else {
2436 mv(t0, idx);
2437 vslidedown_vx(tmp, src, t0);
2438 vfmv_f_s(dst, tmp);
2439 }
2440 }