1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "opto/c2_MacroAssembler.hpp"
29 #include "opto/compile.hpp"
30 #include "opto/intrinsicnode.hpp"
31 #include "opto/output.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/objectMonitorTable.hpp"
34 #include "runtime/stubRoutines.hpp"
35 #include "runtime/synchronizer.hpp"
36 #include "utilities/globalDefinitions.hpp"
37
// Helper macros for the emitters in this file.
//
// BLOCK_COMMENT(str): forwards to block_comment(str) in non-PRODUCT builds and
//                     expands to nothing in PRODUCT builds.
// STOP(error):        emits stop(error); non-PRODUCT builds first emit the same
//                     text through block_comment().
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
// NOTE: expands to two statements, so it must not be the sole body of an
// unbraced if/else/loop.
#define STOP(error) block_comment(error); stop(error)
#endif

// BIND(label): binds the label and passes "<label>:" to BLOCK_COMMENT.
// NOTE: also expands to two statements (see STOP above).
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
47
// Emits the C2 fast path for monitor enter on 'obj'.
//
//   obj        - the object being locked
//   box        - the BasicLock; when UseObjectMonitorTable is set its
//                object-monitor cache slot is cleared on entry and filled with
//                the monitor once an inflated lock has been acquired
//   tmp1..tmp4 - scratch registers; must differ from each other and from
//                obj, box, t0 and t1
//
// The outcome is reported in t1 ('flag'): zero when the lock was taken here,
// non-zero when the caller has to continue in the slow path.
void C2_MacroAssembler::fast_lock(Register obj, Register box,
                                  Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
  // Flag register, zero for success; non-zero for failure.
  Register flag = t1;

  assert_different_registers(obj, box, tmp1, tmp2, tmp3, tmp4, flag, t0);

  // Start out as "failed"; only the 'locked' exit clears the flag.
  mv(flag, 1);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == 0
  Label locked;
  // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    sd(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Objects whose klass has the value-based misc flag set always go to the slow path.
    load_klass(tmp1, obj);
    lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
    test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
    bnez(tmp1, slow_path);
  }

  const Register tmp1_mark = tmp1;
  const Register tmp3_t = tmp3;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
    Label push;

    const Register tmp2_top = tmp2;

    // Check if lock-stack is full.
    // The top is a 32-bit value that is added to xthread below, i.e. it is
    // used as a byte offset from the thread register.
    lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    mv(tmp3_t, (unsigned)LockStack::end_offset());
    bge(tmp2_top, tmp3_t, slow_path);

    // Check if recursive.
    // Compare obj with the entry just below the current top.
    add(tmp3_t, xthread, tmp2_top);
    ld(tmp3_t, Address(tmp3_t, -oopSize));
    beq(obj, tmp3_t, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
    bnez(tmp3_t, inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    // expected = mark with the unlocked bit forced on, new = same value with that bit cleared.
    ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
    xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
    // The CAS result differs from the expected value when the exchange failed.
    bne(tmp1_mark, tmp3_t, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    add(tmp3_t, xthread, tmp2_top);
    sd(obj, Address(tmp3_t));
    addw(tmp2_top, tmp2_top, oopSize);
    sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
    j(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register tmp1_monitor = tmp1;

    if (!UseObjectMonitorTable) {
      // tmp1 still holds the mark word; the monitor tag is compensated for in
      // the field offsets computed below (monitor_tag).
      assert(tmp1_monitor == tmp1_mark, "should be the same here");
    } else {
      const Register tmp2_hash = tmp2;
      const Register tmp3_bucket = tmp3;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mv(tmp2_hash, tmp1_mark);

      // Look for the monitor in the om_cache.
      // The probe is unrolled at code-generation time, one load/compare pair
      // per cache entry; tmp1_monitor is loaded before the compare so it is
      // already valid when the branch to monitor_found is taken.

      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ld(tmp1_monitor, Address(xthread, cache_offset + monitor_offset));
        ld(tmp4, Address(xthread, cache_offset));
        beq(obj, tmp4, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      srli(tmp2_hash, tmp2_hash, markWord::hash_shift);

      // Get the table and calculate the bucket's address.
      la(tmp3_t, ExternalAddress(ObjectMonitorTable::current_table_address()));
      ld(tmp3_t, Address(tmp3_t));
      ld(tmp1, Address(tmp3_t, ObjectMonitorTable::table_capacity_mask_offset()));
      andr(tmp2_hash, tmp2_hash, tmp1);
      ld(tmp3_t, Address(tmp3_t, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      shadd(tmp3_bucket, tmp2_hash, tmp3_t, tmp4, LogBytesPerWord);
      ld(tmp1_monitor, Address(tmp3_bucket));

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      mv(tmp2, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      bltu(tmp1_monitor, tmp2, slow_path);

      // Check if object matches.
      ld(tmp3, Address(tmp1_monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_peek_weak_handle_in_nmethod(this, tmp3, tmp3, tmp2, slow_path);
      bne(tmp3, obj, slow_path);

      bind(monitor_found);
    }

    const Register tmp2_owner_addr = tmp2;
    const Register tmp3_owner = tmp3;

    // Without the table, tmp1_monitor is the tagged mark word, so the tag is
    // subtracted from the field offsets instead of from the register.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(tmp1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(tmp1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    la(tmp2_owner_addr, owner_address);

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    Register tid = tmp4;
    ld(tid, Address(xthread, JavaThread::monitor_owner_id_offset()));
    cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ tid, Assembler::int64,
            /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
    // A zero result means the owner field was empty and the CAS installed tid.
    beqz(tmp3_owner, monitor_locked);

    // Check if recursive.
    bne(tmp3_owner, tid, slow_path);

    // Recursive.
    increment(recursions_address, 1, tmp2, tmp3);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Remember the monitor in the box for the matching unlock.
      sd(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);
  mv(flag, zr);

#ifdef ASSERT
  // Check that locked label is reached with flag == 0.
  Label flag_correct;
  beqz(flag, flag_correct);
  stop("Fast Lock Flag != 0");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag != 0.
  bnez(flag, flag_correct);
  stop("Fast Lock Flag == 0");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (0 vs !0) to determine the continuation.
}
226
227 void C2_MacroAssembler::fast_unlock(Register obj, Register box,
228 Register tmp1, Register tmp2, Register tmp3) {
229 // Flag register, zero for success; non-zero for failure.
230 Register flag = t1;
231
232 assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0);
233
234 mv(flag, 1);
235
236 // Handle inflated monitor.
237 Label inflated, inflated_load_mark;
238 // Finish fast unlock successfully. unlocked MUST branch to with flag == 0
239 Label unlocked;
240 // Finish fast unlock unsuccessfully. MUST branch to with flag != 0
241 Label slow_path;
242
243 const Register tmp1_mark = tmp1;
244 const Register tmp2_top = tmp2;
245 const Register tmp3_t = tmp3;
246
247 { // Fast unlock
248 Label push_and_slow_path;
249
250 // Check if obj is top of lock-stack.
251 lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
252 subw(tmp2_top, tmp2_top, oopSize);
253 add(tmp3_t, xthread, tmp2_top);
254 ld(tmp3_t, Address(tmp3_t));
255 // Top of lock stack was not obj. Must be monitor.
256 bne(obj, tmp3_t, inflated_load_mark);
257
258 // Pop lock-stack.
259 DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
260 DEBUG_ONLY(sd(zr, Address(tmp3_t));)
261 sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
262
263 // Check if recursive.
264 add(tmp3_t, xthread, tmp2_top);
265 ld(tmp3_t, Address(tmp3_t, -oopSize));
266 beq(obj, tmp3_t, unlocked);
267
268 // Not recursive.
269 // Load Mark.
270 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
271
272 // Check header for monitor (0b10).
273 // Because we got here by popping (meaning we pushed in locked)
274 // there will be no monitor in the box. So we need to push back the obj
275 // so that the runtime can fix any potential anonymous owner.
276 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
277 bnez(tmp3_t, UseObjectMonitorTable ? push_and_slow_path : inflated);
278
279 // Try to unlock. Transition lock bits 0b00 => 0b01
280 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
281 ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
282 cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
283 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
284 beq(tmp1_mark, tmp3_t, unlocked);
285
286 bind(push_and_slow_path);
287 // Compare and exchange failed.
288 // Restore lock-stack and handle the unlock in runtime.
289 DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
290 DEBUG_ONLY(sd(obj, Address(tmp3_t));)
291 addw(tmp2_top, tmp2_top, oopSize);
292 sd(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
293 j(slow_path);
294 }
295
296 { // Handle inflated monitor.
297 bind(inflated_load_mark);
298 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
299 #ifdef ASSERT
300 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
301 bnez(tmp3_t, inflated);
302 stop("Fast Unlock not monitor");
303 #endif
304
305 bind(inflated);
306
307 #ifdef ASSERT
308 Label check_done;
309 subw(tmp2_top, tmp2_top, oopSize);
310 mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
311 blt(tmp2_top, tmp3_t, check_done);
312 add(tmp3_t, xthread, tmp2_top);
313 ld(tmp3_t, Address(tmp3_t));
314 bne(obj, tmp3_t, inflated);
315 stop("Fast Unlock lock on stack");
316 bind(check_done);
317 #endif
318
319 const Register tmp1_monitor = tmp1;
320
321 if (!UseObjectMonitorTable) {
322 assert(tmp1_monitor == tmp1_mark, "should be the same here");
323 // Untag the monitor.
324 subi(tmp1_monitor, tmp1_mark, (int)markWord::monitor_value);
325 } else {
326 ld(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
327 // No valid pointer below alignof(ObjectMonitor*). Take the slow path.
328 mv(tmp3_t, alignof(ObjectMonitor*));
329 bltu(tmp1_monitor, tmp3_t, slow_path);
330 }
331
332 const Register tmp2_recursions = tmp2;
333 Label not_recursive;
334
335 // Check if recursive.
336 ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
337 beqz(tmp2_recursions, not_recursive);
338
339 // Recursive unlock.
340 subi(tmp2_recursions, tmp2_recursions, 1);
341 sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
342 j(unlocked);
343
344 bind(not_recursive);
345
346 const Register tmp2_owner_addr = tmp2;
347
348 // Compute owner address.
349 la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));
350
351 // Set owner to null.
352 // Release to satisfy the JMM
353 membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
354 sd(zr, Address(tmp2_owner_addr));
355 // We need a full fence after clearing owner to avoid stranding.
356 // StoreLoad achieves this.
357 membar(StoreLoad);
358
359 // Check if the entry_list is empty.
360 ld(t0, Address(tmp1_monitor, ObjectMonitor::entry_list_offset()));
361 beqz(t0, unlocked); // If so we are done.
362
363 // Check if there is a successor.
364 ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::succ_offset()));
365 bnez(tmp3_t, unlocked); // If so we are done.
366
367 // Save the monitor pointer in the current thread, so we can try
368 // to reacquire the lock in SharedRuntime::monitor_exit_helper().
369 sd(tmp1_monitor, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));
370
371 mv(flag, 1);
372 j(slow_path);
373 }
374
375 bind(unlocked);
376 mv(flag, zr);
377
378 #ifdef ASSERT
379 // Check that unlocked label is reached with flag == 0.
380 Label flag_correct;
381 beqz(flag, flag_correct);
382 stop("Fast Lock Flag != 0");
383 #endif
384
385 bind(slow_path);
386 #ifdef ASSERT
387 // Check that slow_path label is reached with flag != 0.
388 bnez(flag, flag_correct);
389 stop("Fast Lock Flag == 0");
390 bind(flag_correct);
391 #endif
392 // C2 uses the value of flag (0 vs !0) to determine the continuation.
393 }
394
395 // short string
396 // StringUTF16.indexOfChar
397 // StringLatin1.indexOfChar
398 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
399 Register ch, Register result,
400 bool isL)
401 {
402 Register ch1 = t0;
403 Register index = t1;
404
405 BLOCK_COMMENT("string_indexof_char_short {");
406
407 Label LOOP, LOOP1, LOOP4, LOOP8;
408 Label MATCH, MATCH1, MATCH2, MATCH3,
409 MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
410
411 mv(result, -1);
412 mv(index, zr);
413
414 bind(LOOP);
415 addi(t0, index, 8);
416 ble(t0, cnt1, LOOP8);
417 addi(t0, index, 4);
418 ble(t0, cnt1, LOOP4);
419 j(LOOP1);
420
421 bind(LOOP8);
422 isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
423 beq(ch, ch1, MATCH);
424 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
425 beq(ch, ch1, MATCH1);
426 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
427 beq(ch, ch1, MATCH2);
428 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
429 beq(ch, ch1, MATCH3);
430 isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
431 beq(ch, ch1, MATCH4);
432 isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
433 beq(ch, ch1, MATCH5);
434 isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
435 beq(ch, ch1, MATCH6);
436 isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
437 beq(ch, ch1, MATCH7);
438 addi(index, index, 8);
439 addi(str1, str1, isL ? 8 : 16);
440 blt(index, cnt1, LOOP);
441 j(NOMATCH);
442
443 bind(LOOP4);
444 isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
445 beq(ch, ch1, MATCH);
446 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
447 beq(ch, ch1, MATCH1);
448 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
449 beq(ch, ch1, MATCH2);
450 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
451 beq(ch, ch1, MATCH3);
452 addi(index, index, 4);
453 addi(str1, str1, isL ? 4 : 8);
454 bge(index, cnt1, NOMATCH);
455
456 bind(LOOP1);
457 isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
458 beq(ch, ch1, MATCH);
459 addi(index, index, 1);
460 addi(str1, str1, isL ? 1 : 2);
461 blt(index, cnt1, LOOP1);
462 j(NOMATCH);
463
464 bind(MATCH1);
465 addi(index, index, 1);
466 j(MATCH);
467
468 bind(MATCH2);
469 addi(index, index, 2);
470 j(MATCH);
471
472 bind(MATCH3);
473 addi(index, index, 3);
474 j(MATCH);
475
476 bind(MATCH4);
477 addi(index, index, 4);
478 j(MATCH);
479
480 bind(MATCH5);
481 addi(index, index, 5);
482 j(MATCH);
483
484 bind(MATCH6);
485 addi(index, index, 6);
486 j(MATCH);
487
488 bind(MATCH7);
489 addi(index, index, 7);
490
491 bind(MATCH);
492 mv(result, index);
493 bind(NOMATCH);
494 BLOCK_COMMENT("} string_indexof_char_short");
495 }
496
// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
//
// Emits a search for 'ch' in the cnt1 elements starting at str1 (bytes when
// isL is true, 16-bit elements otherwise). 'result' receives the index of the
// first hit or -1.
//
// Inputs with at most 32 (Latin1) / 16 (UTF16) elements are handed to
// string_indexof_char_short(); longer inputs are scanned 8 bytes at a time.
// str1, cnt1 and ch are modified; t0, t1 and tmp1..tmp4 are used as scratch.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2,
                                            Register tmp3, Register tmp4,
                                            bool isL)
{
  Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
  Register ch1 = t0;
  Register orig_cnt = t1;
  Register mask1 = tmp3;
  Register mask2 = tmp2;
  Register match_mask = tmp1;
  // trailing_char and unaligned_elems share tmp4; their uses do not overlap.
  Register trailing_char = tmp4;
  Register unaligned_elems = tmp4;

  BLOCK_COMMENT("string_indexof_char {");
  beqz(cnt1, NOMATCH);

  // Only take the word-at-a-time path when cnt1 > 32 (Latin1) / 16 (UTF16).
  subi(t0, cnt1, isL ? 32 : 16);
  bgtz(t0, DO_LONG);
  string_indexof_char_short(str1, cnt1, ch, result, isL);
  j(DONE);

  bind(DO_LONG);
  mv(orig_cnt, cnt1);
  if (AvoidUnalignedAccesses) {
    Label ALIGNED;
    andi(unaligned_elems, str1, 0x7);
    beqz(unaligned_elems, ALIGNED);
    // unaligned_elems = 8 - (str1 & 7): bytes up to the next 8-byte boundary,
    // converted to an element count for UTF16 below.
    sub(unaligned_elems, unaligned_elems, 8);
    neg(unaligned_elems, unaligned_elems);
    if (!isL) {
      srli(unaligned_elems, unaligned_elems, 1);
    }
    // do unaligned part per element
    string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
    bgez(result, DONE);
    // string_indexof_char_short() uses t1 as its index register, which is
    // also orig_cnt here, so the original count is stored again.
    mv(orig_cnt, cnt1);
    sub(cnt1, cnt1, unaligned_elems);
    bind(ALIGNED);
  }

  // duplicate ch
  // Replicate the searched element into every element lane of the 64-bit 'ch'.
  if (isL) {
    slli(ch1, ch, 8);
    orr(ch, ch1, ch);
  }
  slli(ch1, ch, 16);
  orr(ch, ch1, ch);
  slli(ch1, ch, 32);
  orr(ch, ch1, ch);

  // From here on cnt1 counts bytes, not elements.
  if (!isL) {
    slli(cnt1, cnt1, 1);
  }

  // Per-lane constants handed to compute_match_mask(): the lowest bit of each
  // lane (mask1) and all bits except the highest of each lane (mask2).
  uint64_t mask0101 = UCONST64(0x0101010101010101);
  uint64_t mask0001 = UCONST64(0x0001000100010001);
  mv(mask1, isL ? mask0101 : mask0001);
  uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
  uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
  mv(mask2, isL ? mask7f7f : mask7fff);

  bind(CH1_LOOP);
  ld(ch1, Address(str1));
  addi(str1, str1, 8);
  subi(cnt1, cnt1, 8);
  // presumably match_mask becomes non-zero iff some lane of ch1 equals the
  // corresponding lane of ch -- confirm against compute_match_mask().
  compute_match_mask(ch1, ch, match_mask, mask1, mask2);
  bnez(match_mask, HIT);
  bgtz(cnt1, CH1_LOOP);
  j(NOMATCH);

  bind(HIT);
  // count bits of trailing zero chars
  ctzc_bits(trailing_char, match_mask, isL, ch1, result);
  srli(trailing_char, trailing_char, 3);   // bits -> bytes
  // Undo the subtraction from the loop: cnt1 is now the number of bytes that
  // were still outstanding before the word containing the hit was loaded.
  addi(cnt1, cnt1, 8);
  // A hit at or beyond that many bytes is treated as no match.
  // NOTE(review): presumably this filters hits in bytes past the end of the
  // string picked up by the last 8-byte load -- confirm.
  ble(cnt1, trailing_char, NOMATCH);
  // match case
  if (!isL) {
    // Back from bytes to elements.
    srli(cnt1, cnt1, 1);
    srli(trailing_char, trailing_char, 1);
  }

  // index = elements consumed before this word + element offset inside the word.
  sub(result, orig_cnt, cnt1);
  add(result, result, trailing_char);
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof_char");
}
593
// Pointer to a MacroAssembler load taking (rd, address, temp). The string
// intrinsics below use it to select lbu/lhu/lwu/ld according to the string
// encoding.
typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
595
// Search for needle in haystack and return index or -1
// x10: result
// x11: haystack
// x12: haystack_len
// x13: needle
// x14: needle_len
//
// 'ae' is a StrIntrinsicNode encoding (LL, UU or UL; LU is rejected by the
// assert below). Depending on the lengths the emitted code uses one of three
// strategies:
//   - needle_len < 8                      -> string_indexof_linearscan()
//   - 8 <= needle_len < 256 and
//     needle_len < haystack_len / 4       -> Boyer-Moore-Horspool, inline, with
//                                            a 256-byte skip table on the stack
//   - otherwise                           -> linear-scan stub call when
//                                            needle_len >= 16, else
//                                            string_indexof_linearscan()
// haystack, haystack_len, t0, t1 and tmp1..tmp6 are overwritten.
void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
                                       Register haystack_len, Register needle_len,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       Register result, int ae)
{
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;

  Register ch1 = t0;
  Register ch2 = t1;
  Register nlen_tmp = tmp1; // needle len tmp
  Register hlen_tmp = tmp2; // haystack len tmp
  Register result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                    (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_indexof {");

  // Note, inline_string_indexOf() generates checks:
  // if (pattern.count > src.count) return -1;
  // if (pattern.count == 0) return 0;

  // We have two strings, a source string in haystack, haystack_len and a pattern string
  // in needle, needle_len. Find the first occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
  // result_tmp = haystack_len - needle_len; it is consumed on both the BMH
  // path (haystack_end) and passed on in tmp4 to the linear scan.
  sub(result_tmp, haystack_len, needle_len);
  // needle_len < 8, use linear scan
  sub(t0, needle_len, 8);
  bltz(t0, LINEARSEARCH);
  // needle_len >= 256, use linear scan
  sub(t0, needle_len, 256);
  bgez(t0, LINEARSTUB);
  // needle_len >= haystack_len/4, use linear scan
  srli(t0, haystack_len, 2);
  bge(needle_len, t0, LINEARSTUB);

  // Boyer-Moore-Horspool introduction:
  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
  //      int i, j;
  //      unsigned c;
  //      unsigned char bc[ASIZE];
  //
  //      /* Preprocessing */
  //      for (i = 0; i < ASIZE; ++i)
  //        bc[i] = m;
  //      for (i = 0; i < m - 1; ) {
  //        c = pattern[i];
  //        ++i;
  //        // c < 256 for Latin1 string, so, no need for branch
  //        #ifdef PATTERN_STRING_IS_LATIN1
  //        bc[c] = m - i;
  //        #else
  //        if (c < ASIZE) bc[c] = m - i;
  //        #endif
  //      }
  //
  //      /* Searching */
  //      j = 0;
  //      while (j <= n - m) {
  //        c = src[i+j];
  //        if (pattern[m-1] == c)
  //          int k;
  //          for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  //          if (k < 0) return j;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
  //          // LL case: (c< 256) always true. Remove branch
  //          j += bc[pattern[j+m-1]];
  //          #endif
  //          #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
  //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //          if (c < ASIZE)
  //            j += bc[pattern[j+m-1]];
  //          else
  //            j += 1
  //          #endif
  //          #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
  //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //          if (c < ASIZE)
  //            j += bc[pattern[j+m-1]];
  //          else
  //            j += m
  //          #endif
  //      }
  //      return -1;
  //    }

  // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
  Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;

  Register haystack_end = haystack_len;
  Register skipch = tmp2;

  // pattern length is >=8, so, we can read at least 1 register for cases when
  // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
  // UL case. We'll re-read last character in inner pre-loop code to have
  // single outer pre-loop load
  const int firstStep = isLL ? 7 : 3;

  const int ASIZE = 256;
  const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)

  // Reserve the bad-character table on the stack; every exit from the BMH
  // path below releases it again with addi(sp, sp, ASIZE).
  subi(sp, sp, ASIZE);

  // init BC offset table with default value: needle_len
  // (needle_len < 256 on this path, so it fits into one byte)
  slli(t0, needle_len, 8);
  orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
  slli(tmp1, t0, 16);
  orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
  slli(tmp1, t0, 32);
  orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]

  mv(ch1, sp);  // ch1 is t0
  mv(tmp6, ASIZE / STORE_BYTES); // loop iterations

  bind(BM_INIT_LOOP);
  // for (i = 0; i < ASIZE; ++i)
  //   bc[i] = m;
  // Four 8-byte stores (32 bytes) per loop iteration.
  for (int i = 0; i < 4; i++) {
    sd(tmp5, Address(ch1, i * wordSize));
  }
  addi(ch1, ch1, 32);
  subi(tmp6, tmp6, 4);
  bgtz(tmp6, BM_INIT_LOOP);

  subi(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
  Register orig_haystack = tmp5;
  mv(orig_haystack, haystack);
  // result_tmp = tmp4
  // haystack_end = haystack + ((haystack_len - needle_len) << haystack_chr_shift),
  // the last start position at which the needle still fits.
  shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
  subi(ch2, needle_len, 1); // bc offset init value, ch2 is t1
  mv(tmp3, needle);

  //  for (i = 0; i < m - 1; ) {
  //    c = pattern[i];
  //    ++i;
  //    // c < 256 for Latin1 string, so, no need for branch
  //    #ifdef PATTERN_STRING_IS_LATIN1
  //    bc[c] = m - i;
  //    #else
  //    if (c < ASIZE) bc[c] = m - i;
  //    #endif
  //  }
  bind(BCLOOP);
  (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
  addi(tmp3, tmp3, needle_chr_size);
  if (!needle_isL) {
    // ae == StrIntrinsicNode::UU
    mv(tmp6, ASIZE);
    bgeu(ch1, tmp6, BCSKIP);
  }
  add(tmp4, sp, ch1);
  sb(ch2, Address(tmp4)); // store skip offset to BC offset table

  bind(BCSKIP);
  subi(ch2, ch2, 1); // for next pattern element, skip distance -1
  bgtz(ch2, BCLOOP);

  // tmp6: pattern end, address after needle
  shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
  // NOTE(review): the two loads below use plain ld/lwu on an address derived
  // from needle + needle_len, while the haystack load further down goes
  // through load_long_misaligned(). Confirm that this is acceptable when
  // AvoidUnalignedAccesses is set.
  if (needle_isL == haystack_isL) {
    // load last 8 bytes (8LL/4UU symbols)
    ld(tmp6, Address(tmp6, -wordSize));
  } else {
    // UL: from UTF-16(source) search Latin1(pattern)
    lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
    // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
    // We'll have to wait until load completed, but it's still faster than per-character loads+checks
    srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
    slli(ch2, tmp6, XLEN - 24);
    srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
    slli(ch1, tmp6, XLEN - 16);
    srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
    zext(tmp6, tmp6, 8); // pattern[m-4], 0x0000000d
    slli(ch2, ch2, 16);
    orr(ch2, ch2, ch1); // 0x00000b0c
    slli(result, tmp3, 48); // use result as temp register
    orr(tmp6, tmp6, result); // 0x0a00000d
    slli(result, ch2, 16);
    orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
  }

  // i = m - 1;
  // skipch = j + i;
  // if (skipch == pattern[m - 1]
  //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  // else
  //   move j with bad char offset table
  bind(BMLOOPSTR2);
  // compare pattern to source string backward
  shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
  (this->*haystack_load_1chr)(skipch, Address(result), noreg);
  subi(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
  if (needle_isL == haystack_isL) {
    // re-init tmp3. It's for free because it's executed in parallel with
    // load above. Alternative is to initialize it before loop, but it'll
    // affect performance on in-order systems with 2 or more ld/st pipelines
    srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
  }
  if (!isLL) { // UU/UL case
    slli(ch2, nlen_tmp, 1); // offsets in bytes
  }
  bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
  add(result, haystack, isLL ? nlen_tmp : ch2);
  // load 8 bytes from source string
  // if isLL is false then read granularity can be 2
  load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
  // Compare the haystack word just loaded against the preloaded needle tail.
  mv(ch1, tmp6);
  if (isLL) {
    j(BMLOOPSTR1_AFTER_LOAD);
  } else {
    subi(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
    j(BMLOOPSTR1_CMP);
  }

  // Remaining elements are compared one at a time, walking backwards.
  bind(BMLOOPSTR1);
  shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
  (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
  shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
  (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);

  bind(BMLOOPSTR1_AFTER_LOAD);
  subi(nlen_tmp, nlen_tmp, 1);
  bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);

  bind(BMLOOPSTR1_CMP);
  beq(ch1, ch2, BMLOOPSTR1);

  bind(BMSKIP);
  if (!isLL) {
    // if we've met UTF symbol while searching Latin1 pattern, then we can
    // skip needle_len symbols
    if (needle_isL != haystack_isL) {
      mv(result_tmp, needle_len);
    } else {
      mv(result_tmp, 1);
    }
    mv(t0, ASIZE);
    bgeu(skipch, t0, BMADV);
  }
  add(result_tmp, sp, skipch);
  lbu(result_tmp, Address(result_tmp)); // load skip offset

  bind(BMADV);
  subi(nlen_tmp, needle_len, 1);
  // move haystack after bad char skip offset
  shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
  ble(haystack, haystack_end, BMLOOPSTR2);
  addi(sp, sp, ASIZE);
  j(NOMATCH);

  bind(BMLOOPSTR1_LASTCMP);
  bne(ch1, ch2, BMSKIP);

  bind(BMMATCH);
  // result = distance of the current haystack position from the start, in elements.
  sub(result, haystack, orig_haystack);
  if (!haystack_isL) {
    srli(result, result, 1);
  }
  addi(sp, sp, ASIZE);
  j(DONE);

  bind(LINEARSTUB);
  subi(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
  bltz(t0, LINEARSEARCH);
  mv(result, zr);
  // Pick the stub that matches the encoding pair.
  RuntimeAddress stub = nullptr;
  if (isLL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
    assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
  } else if (needle_isL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
    assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
  } else {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
    assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
  }
  address call = reloc_call(stub);
  if (call == nullptr) {
    // reloc_call() could not emit the call: record the failure and stop
    // emitting. The labels that are still unbound at this point are reset in
    // debug builds.
    DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(LINEARSEARCH);
  // needle_con_cnt == -1: the needle length is taken from the needle_len register.
  string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof");
}
935
936 // string_indexof
937 // result: x10
938 // src: x11
939 // src_count: x12
940 // pattern: x13
941 // pattern_count: x14 or 1/2/3/4
942 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
943 Register haystack_len, Register needle_len,
944 Register tmp1, Register tmp2,
945 Register tmp3, Register tmp4,
946 int needle_con_cnt, Register result, int ae)
947 {
948 // Note:
949 // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
950 // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
951 assert(needle_con_cnt <= 4, "Invalid needle constant count");
952 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
953
954 Register ch1 = t0;
955 Register ch2 = t1;
956 Register hlen_neg = haystack_len, nlen_neg = needle_len;
957 Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
958
959 bool isLL = ae == StrIntrinsicNode::LL;
960
961 bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
962 bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
963 int needle_chr_shift = needle_isL ? 0 : 1;
964 int haystack_chr_shift = haystack_isL ? 0 : 1;
965 int needle_chr_size = needle_isL ? 1 : 2;
966 int haystack_chr_size = haystack_isL ? 1 : 2;
967
968 load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
969 (load_chr_insn)&MacroAssembler::lhu;
970 load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
971 (load_chr_insn)&MacroAssembler::lhu;
972 load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
973 load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
974
975 Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
976
977 Register first = tmp3;
978
979 if (needle_con_cnt == -1) {
980 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
981
982 subi(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
983 bltz(t0, DOSHORT);
984
985 (this->*needle_load_1chr)(first, Address(needle), noreg);
986 slli(t0, needle_len, needle_chr_shift);
987 add(needle, needle, t0);
988 neg(nlen_neg, t0);
989 slli(t0, result_tmp, haystack_chr_shift);
990 add(haystack, haystack, t0);
991 neg(hlen_neg, t0);
992
993 bind(FIRST_LOOP);
994 add(t0, haystack, hlen_neg);
995 (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
996 beq(first, ch2, STR1_LOOP);
997
998 bind(STR2_NEXT);
999 addi(hlen_neg, hlen_neg, haystack_chr_size);
1000 blez(hlen_neg, FIRST_LOOP);
1001 j(NOMATCH);
1002
1003 bind(STR1_LOOP);
1004 addi(nlen_tmp, nlen_neg, needle_chr_size);
1005 addi(hlen_tmp, hlen_neg, haystack_chr_size);
1006 bgez(nlen_tmp, MATCH);
1007
1008 bind(STR1_NEXT);
1009 add(ch1, needle, nlen_tmp);
1010 (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1011 add(ch2, haystack, hlen_tmp);
1012 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1013 bne(ch1, ch2, STR2_NEXT);
1014 addi(nlen_tmp, nlen_tmp, needle_chr_size);
1015 addi(hlen_tmp, hlen_tmp, haystack_chr_size);
1016 bltz(nlen_tmp, STR1_NEXT);
1017 j(MATCH);
1018
1019 bind(DOSHORT);
1020 if (needle_isL == haystack_isL) {
1021 subi(t0, needle_len, 2);
1022 bltz(t0, DO1);
1023 bgtz(t0, DO3);
1024 }
1025 }
1026
1027 if (needle_con_cnt == 4) {
1028 Label CH1_LOOP;
1029 (this->*load_4chr)(ch1, Address(needle), noreg);
1030 subi(result_tmp, haystack_len, 4);
1031 slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
1032 add(haystack, haystack, tmp3);
1033 neg(hlen_neg, tmp3);
1034 if (AvoidUnalignedAccesses) {
1035 // preload first value, then we will read by 1 character per loop, instead of four
1036 // just shifting previous ch2 right by size of character in bits
1037 add(tmp3, haystack, hlen_neg);
1038 (this->*load_4chr)(ch2, Address(tmp3), noreg);
1039 if (isLL) {
1040 // need to erase 1 most significant byte in 32-bit value of ch2
1041 slli(ch2, ch2, 40);
1042 srli(ch2, ch2, 32);
1043 } else {
1044 slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
1045 }
1046 }
1047
1048 bind(CH1_LOOP);
1049 add(tmp3, haystack, hlen_neg);
1050 if (AvoidUnalignedAccesses) {
1051 srli(ch2, ch2, isLL ? 8 : 16);
1052 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
1053 slli(tmp3, tmp3, isLL ? 24 : 48);
1054 add(ch2, ch2, tmp3);
1055 } else {
1056 (this->*load_4chr)(ch2, Address(tmp3), noreg);
1057 }
1058 beq(ch1, ch2, MATCH);
1059 addi(hlen_neg, hlen_neg, haystack_chr_size);
1060 blez(hlen_neg, CH1_LOOP);
1061 j(NOMATCH);
1062 }
1063
1064 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
1065 Label CH1_LOOP;
1066 BLOCK_COMMENT("string_indexof DO2 {");
1067 bind(DO2);
1068 (this->*load_2chr)(ch1, Address(needle), noreg);
1069 if (needle_con_cnt == 2) {
1070 subi(result_tmp, haystack_len, 2);
1071 }
1072 slli(tmp3, result_tmp, haystack_chr_shift);
1073 add(haystack, haystack, tmp3);
1074 neg(hlen_neg, tmp3);
1075 if (AvoidUnalignedAccesses) {
1076 // preload first value, then we will read by 1 character per loop, instead of two
1077 // just shifting previous ch2 right by size of character in bits
1078 add(tmp3, haystack, hlen_neg);
1079 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1080 slli(ch2, ch2, isLL ? 8 : 16);
1081 }
1082 bind(CH1_LOOP);
1083 add(tmp3, haystack, hlen_neg);
1084 if (AvoidUnalignedAccesses) {
1085 srli(ch2, ch2, isLL ? 8 : 16);
1086 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
1087 slli(tmp3, tmp3, isLL ? 8 : 16);
1088 add(ch2, ch2, tmp3);
1089 } else {
1090 (this->*load_2chr)(ch2, Address(tmp3), noreg);
1091 }
1092 beq(ch1, ch2, MATCH);
1093 addi(hlen_neg, hlen_neg, haystack_chr_size);
1094 blez(hlen_neg, CH1_LOOP);
1095 j(NOMATCH);
1096 BLOCK_COMMENT("} string_indexof DO2");
1097 }
1098
1099 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
1100 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1101 BLOCK_COMMENT("string_indexof DO3 {");
1102
1103 bind(DO3);
1104 (this->*load_2chr)(first, Address(needle), noreg);
1105 (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
1106 if (needle_con_cnt == 3) {
1107 subi(result_tmp, haystack_len, 3);
1108 }
1109 slli(hlen_tmp, result_tmp, haystack_chr_shift);
1110 add(haystack, haystack, hlen_tmp);
1111 neg(hlen_neg, hlen_tmp);
1112
1113 bind(FIRST_LOOP);
1114 add(ch2, haystack, hlen_neg);
1115 if (AvoidUnalignedAccesses) {
1116 (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
1117 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1118 slli(tmp2, tmp2, isLL ? 8 : 16);
1119 add(ch2, ch2, tmp2);
1120 } else {
1121 (this->*load_2chr)(ch2, Address(ch2), noreg);
1122 }
1123 beq(first, ch2, STR1_LOOP);
1124
1125 bind(STR2_NEXT);
1126 addi(hlen_neg, hlen_neg, haystack_chr_size);
1127 blez(hlen_neg, FIRST_LOOP);
1128 j(NOMATCH);
1129
1130 bind(STR1_LOOP);
1131 addi(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
1132 add(ch2, haystack, hlen_tmp);
1133 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1134 bne(ch1, ch2, STR2_NEXT);
1135 j(MATCH);
1136 BLOCK_COMMENT("} string_indexof DO3");
1137 }
1138
1139 if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1140 Label DO1_LOOP;
1141
1142 BLOCK_COMMENT("string_indexof DO1 {");
1143 bind(DO1);
1144 (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1145 subi(result_tmp, haystack_len, 1);
1146 slli(tmp3, result_tmp, haystack_chr_shift);
1147 add(haystack, haystack, tmp3);
1148 neg(hlen_neg, tmp3);
1149
1150 bind(DO1_LOOP);
1151 add(tmp3, haystack, hlen_neg);
1152 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1153 beq(ch1, ch2, MATCH);
1154 addi(hlen_neg, hlen_neg, haystack_chr_size);
1155 blez(hlen_neg, DO1_LOOP);
1156 BLOCK_COMMENT("} string_indexof DO1");
1157 }
1158
1159 bind(NOMATCH);
1160 mv(result, -1);
1161 j(DONE);
1162
1163 bind(MATCH);
1164 srai(t0, hlen_neg, haystack_chr_shift);
1165 add(result, result_tmp, t0);
1166
1167 bind(DONE);
1168 }
1169
1170 // Compare longwords
1171 void C2_MacroAssembler::string_compare_long_same_encoding(Register result, Register str1, Register str2,
1172 const bool isLL, Register cnt1, Register cnt2,
1173 Register tmp1, Register tmp2, Register tmp3,
1174 const int STUB_THRESHOLD, Label *STUB, Label *SHORT_STRING, Label *DONE) {
1175 Label TAIL_CHECK, TAIL, NEXT_WORD, DIFFERENCE;
1176
1177 const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1178 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1179
1180 const int minCharsInWord = isLL ? wordSize : wordSize / 2;
1181
1182 // load first parts of strings and finish initialization while loading
1183 beq(str1, str2, *DONE);
1184 // Alignment
1185 if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1186 lwu(tmp1, Address(str1));
1187 lwu(tmp2, Address(str2));
1188 bne(tmp1, tmp2, DIFFERENCE);
1189 addi(str1, str1, 4);
1190 addi(str2, str2, 4);
1191 subi(cnt2, cnt2, minCharsInWord / 2);
1192
1193 // A very short string
1194 mv(t0, minCharsInWord);
1195 ble(cnt2, t0, *SHORT_STRING);
1196 }
1197 #ifdef ASSERT
1198 if (AvoidUnalignedAccesses) {
1199 Label align_ok;
1200 orr(t0, str1, str2);
1201 andi(t0, t0, 0x7);
1202 beqz(t0, align_ok);
1203 stop("bad alignment");
1204 bind(align_ok);
1205 }
1206 #endif
1207 // load 8 bytes once to compare
1208 ld(tmp1, Address(str1));
1209 ld(tmp2, Address(str2));
1210 mv(t0, STUB_THRESHOLD);
1211 bge(cnt2, t0, *STUB);
1212 subi(cnt2, cnt2, minCharsInWord);
1213 beqz(cnt2, TAIL_CHECK);
1214 // convert cnt2 from characters to bytes
1215 if (!isLL) {
1216 slli(cnt2, cnt2, 1);
1217 }
1218 add(str2, str2, cnt2);
1219 add(str1, str1, cnt2);
1220 sub(cnt2, zr, cnt2);
1221 addi(cnt2, cnt2, 8);
1222 bne(tmp1, tmp2, DIFFERENCE);
1223 bgez(cnt2, TAIL);
1224
1225 // main loop
1226 bind(NEXT_WORD);
1227 // 8-byte aligned loads when AvoidUnalignedAccesses is enabled
1228 add(t0, str1, cnt2);
1229 ld(tmp1, Address(t0));
1230 add(t0, str2, cnt2);
1231 ld(tmp2, Address(t0));
1232 addi(cnt2, cnt2, 8);
1233 bne(tmp1, tmp2, DIFFERENCE);
1234 bltz(cnt2, NEXT_WORD);
1235
1236 bind(TAIL);
1237 load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1238 load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1239
1240 bind(TAIL_CHECK);
1241 beq(tmp1, tmp2, *DONE);
1242
1243 // Find the first different characters in the longwords and
1244 // compute their difference.
1245 bind(DIFFERENCE);
1246 xorr(tmp3, tmp1, tmp2);
1247 // count bits of trailing zero chars
1248 ctzc_bits(result, tmp3, isLL);
1249 srl(tmp1, tmp1, result);
1250 srl(tmp2, tmp2, result);
1251 if (isLL) {
1252 zext(tmp1, tmp1, 8);
1253 zext(tmp2, tmp2, 8);
1254 } else {
1255 zext(tmp1, tmp1, 16);
1256 zext(tmp2, tmp2, 16);
1257 }
1258 sub(result, tmp1, tmp2);
1259
1260 j(*DONE);
1261 }
1262
1263 // Compare longwords
1264 void C2_MacroAssembler::string_compare_long_different_encoding(Register result, Register str1, Register str2,
1265 bool isLU, Register cnt1, Register cnt2,
1266 Register tmp1, Register tmp2, Register tmp3,
1267 const int STUB_THRESHOLD, Label *STUB, Label *DONE) {
1268 Label TAIL, NEXT_WORD, DIFFERENCE;
1269
1270 const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1271 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1272
1273 Register strL = isLU ? str1 : str2;
1274 Register strU = isLU ? str2 : str1;
1275 Register tmpL = tmp1, tmpU = tmp2;
1276
1277 // load first parts of strings and finish initialization while loading
1278 mv(t0, STUB_THRESHOLD);
1279 bge(cnt2, t0, *STUB);
1280 lwu(tmpL, Address(strL));
1281 load_long_misaligned(tmpU, Address(strU), tmp3, (base_offset % 8) != 0 ? 4 : 8);
1282 subi(cnt2, cnt2, 4);
1283 add(strL, strL, cnt2);
1284 sub(cnt1, zr, cnt2);
1285 slli(cnt2, cnt2, 1);
1286 add(strU, strU, cnt2);
1287 inflate_lo32(tmp3, tmpL);
1288 mv(tmpL, tmp3);
1289 sub(cnt2, zr, cnt2);
1290 addi(cnt1, cnt1, 4);
1291 addi(cnt2, cnt2, 8);
1292 bne(tmpL, tmpU, DIFFERENCE);
1293 bgez(cnt2, TAIL);
1294
1295 // main loop
1296 bind(NEXT_WORD);
1297 add(t0, strL, cnt1);
1298 lwu(tmpL, Address(t0));
1299 add(t0, strU, cnt2);
1300 load_long_misaligned(tmpU, Address(t0), tmp3, (base_offset % 8) != 0 ? 4 : 8);
1301 addi(cnt1, cnt1, 4);
1302 inflate_lo32(tmp3, tmpL);
1303 mv(tmpL, tmp3);
1304 addi(cnt2, cnt2, 8);
1305 bne(tmpL, tmpU, DIFFERENCE);
1306 bltz(cnt2, NEXT_WORD);
1307
1308 bind(TAIL);
1309 load_int_misaligned(tmpL, Address(strL), tmp3, false);
1310 load_long_misaligned(tmpU, Address(strU), tmp3, 2);
1311 inflate_lo32(tmp3, tmpL);
1312 mv(tmpL, tmp3);
1313
1314 beq(tmpL, tmpU, *DONE);
1315
1316 // Find the first different characters in the longwords and
1317 // compute their difference.
1318 bind(DIFFERENCE);
1319 xorr(tmp3, tmpL, tmpU);
1320 // count bits of trailing zero chars
1321 ctzc_bits(result, tmp3);
1322 srl(tmpL, tmpL, result);
1323 srl(tmpU, tmpU, result);
1324 zext(tmpL, tmpL, 16);
1325 zext(tmpU, tmpU, 16);
1326 if (isLU) {
1327 sub(result, tmpL, tmpU);
1328 } else {
1329 sub(result, tmpU, tmpL);
1330 }
1331
1332 j(*DONE);
1333 }
1334
1335 // Compare strings.
1336 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1337 Register cnt1, Register cnt2, Register result,
1338 Register tmp1, Register tmp2, Register tmp3,
1339 int ae)
1340 {
1341 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, STUB,
1342 SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1343 SHORT_LOOP_START, L;
1344
1345 const int STUB_THRESHOLD = 64 + 8;
1346 bool isLL = ae == StrIntrinsicNode::LL;
1347 bool isLU = ae == StrIntrinsicNode::LU;
1348 bool isUL = ae == StrIntrinsicNode::UL;
1349
1350 bool str1_isL = isLL || isLU;
1351 bool str2_isL = isLL || isUL;
1352
1353 // for L strings, 1 byte for 1 character
1354 // for U strings, 2 bytes for 1 character
1355 int str1_chr_size = str1_isL ? 1 : 2;
1356 int str2_chr_size = str2_isL ? 1 : 2;
1357 int minCharsInWord = isLL ? wordSize : wordSize / 2;
1358
1359 load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1360 load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1361
1362 BLOCK_COMMENT("string_compare {");
1363
1364 // Bizarrely, the counts are passed in bytes, regardless of whether they
1365 // are L or U strings, however the result is always in characters.
1366 if (!str1_isL) {
1367 sraiw(cnt1, cnt1, 1);
1368 }
1369 if (!str2_isL) {
1370 sraiw(cnt2, cnt2, 1);
1371 }
1372
1373 // Compute the minimum of the string lengths and save the difference in result.
1374 sub(result, cnt1, cnt2);
1375 bgt(cnt1, cnt2, L);
1376 mv(cnt2, cnt1);
1377 bind(L);
1378
1379 // A very short string
1380 mv(t0, minCharsInWord);
1381 ble(cnt2, t0, SHORT_STRING);
1382
1383 // Compare longwords
1384 {
1385 if (str1_isL == str2_isL) { // LL or UU
1386 string_compare_long_same_encoding(result,
1387 str1, str2, isLL,
1388 cnt1, cnt2, tmp1, tmp2, tmp3,
1389 STUB_THRESHOLD, &STUB, &SHORT_STRING, &DONE);
1390 } else { // LU or UL
1391 string_compare_long_different_encoding(result,
1392 str1, str2, isLU,
1393 cnt1, cnt2, tmp1, tmp2, tmp3,
1394 STUB_THRESHOLD, &STUB, &DONE);
1395 }
1396 }
1397
1398 bind(STUB);
1399 RuntimeAddress stub = nullptr;
1400 switch (ae) {
1401 case StrIntrinsicNode::LL:
1402 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1403 break;
1404 case StrIntrinsicNode::UU:
1405 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1406 break;
1407 case StrIntrinsicNode::LU:
1408 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1409 break;
1410 case StrIntrinsicNode::UL:
1411 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1412 break;
1413 default:
1414 ShouldNotReachHere();
1415 }
1416 assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1417 address call = reloc_call(stub);
1418 if (call == nullptr) {
1419 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1420 ciEnv::current()->record_failure("CodeCache is full");
1421 return;
1422 }
1423 j(DONE);
1424
1425 bind(SHORT_STRING);
1426 // Is the minimum length zero?
1427 beqz(cnt2, DONE);
1428 // arrange code to do most branches while loading and loading next characters
1429 // while comparing previous
1430 (this->*str1_load_chr)(tmp1, Address(str1), t0);
1431 addi(str1, str1, str1_chr_size);
1432 subi(cnt2, cnt2, 1);
1433 beqz(cnt2, SHORT_LAST_INIT);
1434 (this->*str2_load_chr)(cnt1, Address(str2), t0);
1435 addi(str2, str2, str2_chr_size);
1436 j(SHORT_LOOP_START);
1437 bind(SHORT_LOOP);
1438 subi(cnt2, cnt2, 1);
1439 beqz(cnt2, SHORT_LAST);
1440 bind(SHORT_LOOP_START);
1441 (this->*str1_load_chr)(tmp2, Address(str1), t0);
1442 addi(str1, str1, str1_chr_size);
1443 (this->*str2_load_chr)(t0, Address(str2), t0);
1444 addi(str2, str2, str2_chr_size);
1445 bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1446 subi(cnt2, cnt2, 1);
1447 beqz(cnt2, SHORT_LAST2);
1448 (this->*str1_load_chr)(tmp1, Address(str1), t0);
1449 addi(str1, str1, str1_chr_size);
1450 (this->*str2_load_chr)(cnt1, Address(str2), t0);
1451 addi(str2, str2, str2_chr_size);
1452 beq(tmp2, t0, SHORT_LOOP);
1453 sub(result, tmp2, t0);
1454 j(DONE);
1455 bind(SHORT_LOOP_TAIL);
1456 sub(result, tmp1, cnt1);
1457 j(DONE);
1458 bind(SHORT_LAST2);
1459 beq(tmp2, t0, DONE);
1460 sub(result, tmp2, t0);
1461
1462 j(DONE);
1463 bind(SHORT_LAST_INIT);
1464 (this->*str2_load_chr)(cnt1, Address(str2), t0);
1465 addi(str2, str2, str2_chr_size);
1466 bind(SHORT_LAST);
1467 beq(tmp1, cnt1, DONE);
1468 sub(result, tmp1, cnt1);
1469
1470 bind(DONE);
1471
1472 BLOCK_COMMENT("} string_compare");
1473 }
1474
1475 void C2_MacroAssembler::arrays_equals(Register a1, Register a2,
1476 Register tmp1, Register tmp2, Register tmp3,
1477 Register result, int elem_size) {
1478 assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1479 assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0);
1480
1481 int elem_per_word = wordSize / elem_size;
1482 int log_elem_size = exact_log2(elem_size);
1483 int length_offset = arrayOopDesc::length_offset_in_bytes();
1484 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1485
1486 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1487
1488 Register cnt1 = tmp3;
1489 Register cnt2 = tmp1; // cnt2 only used in array length compare
1490 Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01;
1491
1492 BLOCK_COMMENT("arrays_equals {");
1493
1494 // if (a1 == a2), return true
1495 beq(a1, a2, SAME);
1496
1497 mv(result, false);
1498 // if (a1 == nullptr || a2 == nullptr)
1499 // return false;
1500 beqz(a1, DONE);
1501 beqz(a2, DONE);
1502
1503 // if (a1.length != a2.length)
1504 // return false;
1505 lwu(cnt1, Address(a1, length_offset));
1506 lwu(cnt2, Address(a2, length_offset));
1507 bne(cnt1, cnt2, DONE);
1508
1509 la(a1, Address(a1, base_offset));
1510 la(a2, Address(a2, base_offset));
1511
1512 // Load 4 bytes once to compare for alignment before main loop.
1513 if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1514 subi(cnt1, cnt1, elem_per_word / 2);
1515 bltz(cnt1, TAIL03);
1516 lwu(tmp1, Address(a1));
1517 lwu(tmp2, Address(a2));
1518 addi(a1, a1, 4);
1519 addi(a2, a2, 4);
1520 bne(tmp1, tmp2, DONE);
1521 }
1522
1523 // Check for short strings, i.e. smaller than wordSize.
1524 subi(cnt1, cnt1, elem_per_word);
1525 bltz(cnt1, SHORT);
1526
1527 #ifdef ASSERT
1528 if (AvoidUnalignedAccesses) {
1529 Label align_ok;
1530 orr(t0, a1, a2);
1531 andi(t0, t0, 0x7);
1532 beqz(t0, align_ok);
1533 stop("bad alignment");
1534 bind(align_ok);
1535 }
1536 #endif
1537
1538 // Main 8 byte comparison loop.
1539 bind(NEXT_WORD); {
1540 ld(tmp1, Address(a1));
1541 ld(tmp2, Address(a2));
1542 subi(cnt1, cnt1, elem_per_word);
1543 addi(a1, a1, wordSize);
1544 addi(a2, a2, wordSize);
1545 bne(tmp1, tmp2, DONE);
1546 } bgez(cnt1, NEXT_WORD);
1547
1548 addi(tmp1, cnt1, elem_per_word);
1549 beqz(tmp1, SAME);
1550
1551 bind(SHORT);
1552 test_bit(tmp1, cnt1, 2 - log_elem_size);
1553 beqz(tmp1, TAIL03); // 0-7 bytes left.
1554 {
1555 lwu(tmp1, Address(a1));
1556 lwu(tmp2, Address(a2));
1557 addi(a1, a1, 4);
1558 addi(a2, a2, 4);
1559 bne(tmp1, tmp2, DONE);
1560 }
1561
1562 bind(TAIL03);
1563 test_bit(tmp1, cnt1, 1 - log_elem_size);
1564 beqz(tmp1, TAIL01); // 0-3 bytes left.
1565 {
1566 lhu(tmp1, Address(a1));
1567 lhu(tmp2, Address(a2));
1568 addi(a1, a1, 2);
1569 addi(a2, a2, 2);
1570 bne(tmp1, tmp2, DONE);
1571 }
1572
1573 bind(TAIL01);
1574 if (elem_size == 1) { // Only needed when comparing byte arrays.
1575 test_bit(tmp1, cnt1, 0);
1576 beqz(tmp1, SAME); // 0-1 bytes left.
1577 {
1578 lbu(tmp1, Address(a1));
1579 lbu(tmp2, Address(a2));
1580 bne(tmp1, tmp2, DONE);
1581 }
1582 }
1583
1584 bind(SAME);
1585 mv(result, true);
1586 // That's it.
1587 bind(DONE);
1588
1589 BLOCK_COMMENT("} arrays_equals");
1590 }
1591
1592 // Compare Strings
1593
1594 // For Strings we're passed the address of the first characters in a1 and a2
1595 // and the length in cnt1. There are two implementations.
1596 // For arrays >= 8 bytes, all comparisons (except for the tail) are performed
1597 // 8 bytes at a time. For the tail, we compare a halfword, then a short, and then a byte.
1598 // For strings < 8 bytes, we compare a halfword, then a short, and then a byte.
1599
1600 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1601 Register result, Register cnt1)
1602 {
1603 Label SAME, DONE, SHORT, NEXT_WORD, TAIL03, TAIL01;
1604 Register tmp1 = t0;
1605 Register tmp2 = t1;
1606
1607 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1608
1609 int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1610
1611 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1612
1613 BLOCK_COMMENT("string_equals {");
1614
1615 mv(result, false);
1616
1617 // Load 4 bytes once to compare for alignment before main loop.
1618 if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1619 subi(cnt1, cnt1, 4);
1620 bltz(cnt1, TAIL03);
1621 lwu(tmp1, Address(a1));
1622 lwu(tmp2, Address(a2));
1623 addi(a1, a1, 4);
1624 addi(a2, a2, 4);
1625 bne(tmp1, tmp2, DONE);
1626 }
1627
1628 // Check for short strings, i.e. smaller than wordSize.
1629 subi(cnt1, cnt1, wordSize);
1630 bltz(cnt1, SHORT);
1631
1632 #ifdef ASSERT
1633 if (AvoidUnalignedAccesses) {
1634 Label align_ok;
1635 orr(t0, a1, a2);
1636 andi(t0, t0, 0x7);
1637 beqz(t0, align_ok);
1638 stop("bad alignment");
1639 bind(align_ok);
1640 }
1641 #endif
1642
1643 // Main 8 byte comparison loop.
1644 bind(NEXT_WORD); {
1645 ld(tmp1, Address(a1));
1646 ld(tmp2, Address(a2));
1647 subi(cnt1, cnt1, wordSize);
1648 addi(a1, a1, wordSize);
1649 addi(a2, a2, wordSize);
1650 bne(tmp1, tmp2, DONE);
1651 } bgez(cnt1, NEXT_WORD);
1652
1653 addi(tmp1, cnt1, wordSize);
1654 beqz(tmp1, SAME);
1655
1656 bind(SHORT);
1657 // 0-7 bytes left.
1658 test_bit(tmp1, cnt1, 2);
1659 beqz(tmp1, TAIL03);
1660 {
1661 lwu(tmp1, Address(a1));
1662 lwu(tmp2, Address(a2));
1663 addi(a1, a1, 4);
1664 addi(a2, a2, 4);
1665 bne(tmp1, tmp2, DONE);
1666 }
1667
1668 bind(TAIL03);
1669 // 0-3 bytes left.
1670 test_bit(tmp1, cnt1, 1);
1671 beqz(tmp1, TAIL01);
1672 {
1673 lhu(tmp1, Address(a1));
1674 lhu(tmp2, Address(a2));
1675 addi(a1, a1, 2);
1676 addi(a2, a2, 2);
1677 bne(tmp1, tmp2, DONE);
1678 }
1679
1680 bind(TAIL01);
1681 // 0-1 bytes left.
1682 test_bit(tmp1, cnt1, 0);
1683 beqz(tmp1, SAME);
1684 {
1685 lbu(tmp1, Address(a1));
1686 lbu(tmp2, Address(a2));
1687 bne(tmp1, tmp2, DONE);
1688 }
1689
1690 // Arrays are equal.
1691 bind(SAME);
1692 mv(result, true);
1693
1694 // That's it.
1695 bind(DONE);
1696 BLOCK_COMMENT("} string_equals");
1697 }
1698
1699 // jdk.internal.util.ArraysSupport.vectorizedHashCode
1700 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
1701 Register tmp1, Register tmp2, Register tmp3,
1702 Register tmp4, Register tmp5, Register tmp6,
1703 BasicType eltype)
1704 {
1705 assert(!UseRVV, "sanity");
1706 assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);
1707
1708 const int elsize = arrays_hashcode_elsize(eltype);
1709 const int chunks_end_shift = exact_log2(elsize);
1710
1711 switch (eltype) {
1712 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
1713 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
1714 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
1715 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
1716 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
1717 default:
1718 ShouldNotReachHere();
1719 }
1720
1721 const int stride = 4;
1722 const Register pow31_4 = tmp1;
1723 const Register pow31_3 = tmp2;
1724 const Register pow31_2 = tmp3;
1725 const Register chunks = tmp4;
1726 const Register chunks_end = chunks;
1727
1728 Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;
1729
1730 // result has a value initially
1731
1732 beqz(cnt, DONE);
1733
1734 andi(chunks, cnt, ~(stride - 1));
1735 beqz(chunks, TAIL);
1736
1737 mv(pow31_4, 923521); // [31^^4]
1738 mv(pow31_3, 29791); // [31^^3]
1739 mv(pow31_2, 961); // [31^^2]
1740
1741 shadd(chunks_end, chunks, ary, t0, chunks_end_shift);
1742 andi(cnt, cnt, stride - 1); // don't forget about tail!
1743
1744 bind(WIDE_LOOP);
1745 arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype);
1746 arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype);
1747 arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
1748 arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
1749 mulw(result, result, pow31_4); // 31^^4 * h
1750 mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0]
1751 addw(result, result, t0);
1752 mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1]
1753 addw(result, result, t1);
1754 slli(t0, tmp5, 5); // optimize 31^^1 * ary[i+2]
1755 subw(tmp5, t0, tmp5); // with ary[i+2]<<5 - ary[i+2]
1756 addw(result, result, tmp5);
1757 addw(result, result, tmp6); // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
1758 // + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
1759 addi(ary, ary, elsize * stride);
1760 bne(ary, chunks_end, WIDE_LOOP);
1761 beqz(cnt, DONE);
1762
1763 bind(TAIL);
1764 shadd(chunks_end, cnt, ary, t0, chunks_end_shift);
1765
1766 bind(TAIL_LOOP);
1767 arrays_hashcode_elload(t0, Address(ary), eltype);
1768 slli(t1, result, 5); // optimize 31 * result
1769 subw(result, t1, result); // with result<<5 - result
1770 addw(result, result, t0);
1771 addi(ary, ary, elsize);
1772 bne(ary, chunks_end, TAIL_LOOP);
1773
1774 bind(DONE);
1775 BLOCK_COMMENT("} // arrays_hashcode");
1776 }
1777
1778 void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register result,
1779 Register tmp1, Register tmp2, Register tmp3,
1780 BasicType eltype)
1781 {
1782 assert(UseRVV, "sanity");
1783 assert(StubRoutines::riscv::arrays_hashcode_powers_of_31() != nullptr, "sanity");
1784 assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, t0, t1);
1785
1786 // The MaxVectorSize should have been set by detecting RVV max vector register
1787 // size when check UseRVV (i.e. MaxVectorSize == VM_Version::_initial_vector_length).
1788 // Let's use T_INT as all hashCode calculations eventually deal with ints.
1789 const int lmul = 2;
1790 const int stride = MaxVectorSize / sizeof(jint) * lmul;
1791
1792 const int elsize_bytes = arrays_hashcode_elsize(eltype);
1793 const int elsize_shift = exact_log2(elsize_bytes);
1794
1795 switch (eltype) {
1796 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode_v(unsigned byte) {"); break;
1797 case T_CHAR: BLOCK_COMMENT("arrays_hashcode_v(char) {"); break;
1798 case T_BYTE: BLOCK_COMMENT("arrays_hashcode_v(byte) {"); break;
1799 case T_SHORT: BLOCK_COMMENT("arrays_hashcode_v(short) {"); break;
1800 case T_INT: BLOCK_COMMENT("arrays_hashcode_v(int) {"); break;
1801 default:
1802 ShouldNotReachHere();
1803 }
1804
1805 const Register pow31_highest = tmp1;
1806 const Register ary_end = tmp2;
1807 const Register consumed = tmp3;
1808
1809 const VectorRegister v_sum = v2;
1810 const VectorRegister v_src = v4;
1811 const VectorRegister v_coeffs = v6;
1812 const VectorRegister v_tmp = v8;
1813
1814 const address adr_pows31 = StubRoutines::riscv::arrays_hashcode_powers_of_31()
1815 + sizeof(jint);
1816 Label VEC_LOOP, DONE, SCALAR_TAIL, SCALAR_TAIL_LOOP;
1817
1818 // NB: at this point (a) 'result' already has some value,
1819 // (b) 'cnt' is not 0 or 1, see java code for details.
1820
1821 andi(t0, cnt, ~(stride - 1));
1822 beqz(t0, SCALAR_TAIL);
1823
1824 la(t1, ExternalAddress(adr_pows31));
1825 lw(pow31_highest, Address(t1, -1 * sizeof(jint)));
1826
1827 vsetvli(consumed, cnt, Assembler::e32, Assembler::m2);
1828 vle32_v(v_coeffs, t1); // 31^^(stride - 1) ... 31^^0
1829 vmv_v_x(v_sum, x0);
1830
1831 bind(VEC_LOOP);
1832 arrays_hashcode_elload_v(v_src, v_tmp, ary, eltype);
1833 vmul_vv(v_src, v_src, v_coeffs);
1834 vmadd_vx(v_sum, pow31_highest, v_src);
1835 mulw(result, result, pow31_highest);
1836 shadd(ary, consumed, ary, t0, elsize_shift);
1837 subw(cnt, cnt, consumed);
1838 andi(t1, cnt, ~(stride - 1));
1839 bnez(t1, VEC_LOOP);
1840
1841 vmv_s_x(v_tmp, x0);
1842 vredsum_vs(v_sum, v_sum, v_tmp);
1843 vmv_x_s(t0, v_sum);
1844 addw(result, result, t0);
1845 beqz(cnt, DONE);
1846
1847 bind(SCALAR_TAIL);
1848 shadd(ary_end, cnt, ary, t0, elsize_shift);
1849
1850 bind(SCALAR_TAIL_LOOP);
1851 arrays_hashcode_elload(t0, Address(ary), eltype);
1852 slli(t1, result, 5); // optimize 31 * result
1853 subw(result, t1, result); // with result<<5 - result
1854 addw(result, result, t0);
1855 addi(ary, ary, elsize_bytes);
1856 bne(ary, ary_end, SCALAR_TAIL_LOOP);
1857
1858 bind(DONE);
1859 BLOCK_COMMENT("} // arrays_hashcode_v");
1860 }
1861
1862 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
1863 switch (eltype) {
1864 case T_BOOLEAN: return sizeof(jboolean);
1865 case T_BYTE: return sizeof(jbyte);
1866 case T_SHORT: return sizeof(jshort);
1867 case T_CHAR: return sizeof(jchar);
1868 case T_INT: return sizeof(jint);
1869 default:
1870 ShouldNotReachHere();
1871 return -1;
1872 }
1873 }
1874
1875 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
1876 switch (eltype) {
1877 // T_BOOLEAN used as surrogate for unsigned byte
1878 case T_BOOLEAN: lbu(dst, src); break;
1879 case T_BYTE: lb(dst, src); break;
1880 case T_SHORT: lh(dst, src); break;
1881 case T_CHAR: lhu(dst, src); break;
1882 case T_INT: lw(dst, src); break;
1883 default:
1884 ShouldNotReachHere();
1885 }
1886 }
1887
1888 void C2_MacroAssembler::arrays_hashcode_elload_v(VectorRegister vdst,
1889 VectorRegister vtmp,
1890 Register src,
1891 BasicType eltype) {
1892 assert_different_registers(vdst, vtmp);
1893 switch (eltype) {
1894 case T_BOOLEAN:
1895 vle8_v(vtmp, src);
1896 vzext_vf4(vdst, vtmp);
1897 break;
1898 case T_BYTE:
1899 vle8_v(vtmp, src);
1900 vsext_vf4(vdst, vtmp);
1901 break;
1902 case T_CHAR:
1903 vle16_v(vtmp, src);
1904 vzext_vf2(vdst, vtmp);
1905 break;
1906 case T_SHORT:
1907 vle16_v(vtmp, src);
1908 vsext_vf2(vdst, vtmp);
1909 break;
1910 case T_INT:
1911 vle32_v(vdst, src);
1912 break;
1913 default:
1914 ShouldNotReachHere();
1915 }
1916 }
1917
1918 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1919 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1920 bool is_far, bool is_unordered);
1921
// Dispatch table for C2_MacroAssembler::cmp_branch(), indexed directly by its
// cmpFlag argument. The entry order is significant: the slots annotated with
// BoolTest::overflow / BoolTest::no_overflow have no branch form and are null.
// The second group of eight entries holds the unsigned variants
// (presumably selected by OR-ing unsigned_branch_mask into cmpFlag - confirm
// against the callers).
static conditional_branch_insn conditional_branches[] =
{
  /* SHORT branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgt,
  nullptr, // BoolTest::overflow
  (conditional_branch_insn)&MacroAssembler::blt,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::ble,
  nullptr, // BoolTest::no_overflow
  (conditional_branch_insn)&MacroAssembler::bge,

  /* UNSIGNED branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgtu,
  nullptr, // same slot as BoolTest::overflow in the group above
  (conditional_branch_insn)&MacroAssembler::bltu,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::bleu,
  nullptr, // same slot as BoolTest::no_overflow in the group above
  (conditional_branch_insn)&MacroAssembler::bgeu
};
1944
// Dispatch table for C2_MacroAssembler::float_cmp_branch(), indexed directly by
// its cmpFlag argument. The first eight entries are the single-precision
// emitters and the next eight the double-precision ones, in the same order.
// The slots annotated with BoolTest::overflow / BoolTest::no_overflow have no
// branch form and are null.
static float_conditional_branch_insn float_conditional_branches[] =
{
  /* FLOAT SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::float_beq,
  (float_conditional_branch_insn)&MacroAssembler::float_bgt,
  nullptr, // BoolTest::overflow
  (float_conditional_branch_insn)&MacroAssembler::float_blt,
  (float_conditional_branch_insn)&MacroAssembler::float_bne,
  (float_conditional_branch_insn)&MacroAssembler::float_ble,
  nullptr, // BoolTest::no_overflow
  (float_conditional_branch_insn)&MacroAssembler::float_bge,

  /* DOUBLE SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::double_beq,
  (float_conditional_branch_insn)&MacroAssembler::double_bgt,
  nullptr, // same slot as BoolTest::overflow in the group above
  (float_conditional_branch_insn)&MacroAssembler::double_blt,
  (float_conditional_branch_insn)&MacroAssembler::double_bne,
  (float_conditional_branch_insn)&MacroAssembler::double_ble,
  nullptr, // same slot as BoolTest::no_overflow in the group above
  (float_conditional_branch_insn)&MacroAssembler::double_bge
};
1967
1968 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1969 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1970 "invalid conditional branch index");
1971 (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1972 }
1973
1974 // This is a function should only be used by C2. Flip the unordered when unordered-greater, C2 would use
1975 // unordered-lesser instead of unordered-greater. Finally, commute the result bits at function do_one_bytecode().
1976 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
1977 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
1978 "invalid float conditional branch index");
1979 int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
1980 (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
1981 (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
1982 }
1983
1984 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
1985 switch (cmpFlag) {
1986 case BoolTest::eq:
1987 case BoolTest::le:
1988 beqz(op1, L, is_far);
1989 break;
1990 case BoolTest::ne:
1991 case BoolTest::gt:
1992 bnez(op1, L, is_far);
1993 break;
1994 default:
1995 ShouldNotReachHere();
1996 }
1997 }
1998
1999 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
2000 switch (cmpFlag) {
2001 case BoolTest::eq:
2002 beqz(op1, L, is_far);
2003 break;
2004 case BoolTest::ne:
2005 bnez(op1, L, is_far);
2006 break;
2007 default:
2008 ShouldNotReachHere();
2009 }
2010 }
2011
2012 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
2013 bool is_unsigned = (cmpFlag & unsigned_branch_mask) == unsigned_branch_mask;
2014 int op_select = cmpFlag & (~unsigned_branch_mask);
2015
2016 switch (op_select) {
2017 case BoolTest::eq:
2018 cmov_eq(op1, op2, dst, src);
2019 break;
2020 case BoolTest::ne:
2021 cmov_ne(op1, op2, dst, src);
2022 break;
2023 case BoolTest::le:
2024 if (is_unsigned) {
2025 cmov_leu(op1, op2, dst, src);
2026 } else {
2027 cmov_le(op1, op2, dst, src);
2028 }
2029 break;
2030 case BoolTest::ge:
2031 if (is_unsigned) {
2032 cmov_geu(op1, op2, dst, src);
2033 } else {
2034 cmov_ge(op1, op2, dst, src);
2035 }
2036 break;
2037 case BoolTest::lt:
2038 if (is_unsigned) {
2039 cmov_ltu(op1, op2, dst, src);
2040 } else {
2041 cmov_lt(op1, op2, dst, src);
2042 }
2043 break;
2044 case BoolTest::gt:
2045 if (is_unsigned) {
2046 cmov_gtu(op1, op2, dst, src);
2047 } else {
2048 cmov_gt(op1, op2, dst, src);
2049 }
2050 break;
2051 default:
2052 assert(false, "unsupported compare condition");
2053 ShouldNotReachHere();
2054 }
2055 }
2056
2057 void C2_MacroAssembler::enc_cmove_cmp_fp(int cmpFlag, FloatRegister op1, FloatRegister op2, Register dst, Register src, bool is_single) {
2058 int op_select = cmpFlag & (~unsigned_branch_mask);
2059
2060 switch (op_select) {
2061 case BoolTest::eq:
2062 cmov_cmp_fp_eq(op1, op2, dst, src, is_single);
2063 break;
2064 case BoolTest::ne:
2065 cmov_cmp_fp_ne(op1, op2, dst, src, is_single);
2066 break;
2067 case BoolTest::le:
2068 cmov_cmp_fp_le(op1, op2, dst, src, is_single);
2069 break;
2070 case BoolTest::ge:
2071 cmov_cmp_fp_ge(op1, op2, dst, src, is_single);
2072 break;
2073 case BoolTest::lt:
2074 cmov_cmp_fp_lt(op1, op2, dst, src, is_single);
2075 break;
2076 case BoolTest::gt:
2077 cmov_cmp_fp_gt(op1, op2, dst, src, is_single);
2078 break;
2079 default:
2080 assert(false, "unsupported compare condition");
2081 ShouldNotReachHere();
2082 }
2083 }
2084
2085 void C2_MacroAssembler::enc_cmove_fp_cmp(int cmpFlag, Register op1, Register op2,
2086 FloatRegister dst, FloatRegister src, bool is_single) {
2087 bool is_unsigned = (cmpFlag & unsigned_branch_mask) == unsigned_branch_mask;
2088 int op_select = cmpFlag & (~unsigned_branch_mask);
2089
2090 switch (op_select) {
2091 case BoolTest::eq:
2092 cmov_fp_eq(op1, op2, dst, src, is_single);
2093 break;
2094 case BoolTest::ne:
2095 cmov_fp_ne(op1, op2, dst, src, is_single);
2096 break;
2097 case BoolTest::le:
2098 if (is_unsigned) {
2099 cmov_fp_leu(op1, op2, dst, src, is_single);
2100 } else {
2101 cmov_fp_le(op1, op2, dst, src, is_single);
2102 }
2103 break;
2104 case BoolTest::ge:
2105 if (is_unsigned) {
2106 cmov_fp_geu(op1, op2, dst, src, is_single);
2107 } else {
2108 cmov_fp_ge(op1, op2, dst, src, is_single);
2109 }
2110 break;
2111 case BoolTest::lt:
2112 if (is_unsigned) {
2113 cmov_fp_ltu(op1, op2, dst, src, is_single);
2114 } else {
2115 cmov_fp_lt(op1, op2, dst, src, is_single);
2116 }
2117 break;
2118 case BoolTest::gt:
2119 if (is_unsigned) {
2120 cmov_fp_gtu(op1, op2, dst, src, is_single);
2121 } else {
2122 cmov_fp_gt(op1, op2, dst, src, is_single);
2123 }
2124 break;
2125 default:
2126 assert(false, "unsupported compare condition");
2127 ShouldNotReachHere();
2128 }
2129 }
2130
2131 void C2_MacroAssembler::enc_cmove_fp_cmp_fp(int cmpFlag,
2132 FloatRegister op1, FloatRegister op2,
2133 FloatRegister dst, FloatRegister src,
2134 bool cmp_single, bool cmov_single) {
2135 int op_select = cmpFlag & (~unsigned_branch_mask);
2136
2137 switch (op_select) {
2138 case BoolTest::eq:
2139 cmov_fp_cmp_fp_eq(op1, op2, dst, src, cmp_single, cmov_single);
2140 break;
2141 case BoolTest::ne:
2142 cmov_fp_cmp_fp_ne(op1, op2, dst, src, cmp_single, cmov_single);
2143 break;
2144 case BoolTest::le:
2145 cmov_fp_cmp_fp_le(op1, op2, dst, src, cmp_single, cmov_single);
2146 break;
2147 case BoolTest::ge:
2148 cmov_fp_cmp_fp_ge(op1, op2, dst, src, cmp_single, cmov_single);
2149 break;
2150 case BoolTest::lt:
2151 cmov_fp_cmp_fp_lt(op1, op2, dst, src, cmp_single, cmov_single);
2152 break;
2153 case BoolTest::gt:
2154 cmov_fp_cmp_fp_gt(op1, op2, dst, src, cmp_single, cmov_single);
2155 break;
2156 default:
2157 assert(false, "unsupported compare condition");
2158 ShouldNotReachHere();
2159 }
2160 }
2161
2162 // Set dst to NaN if any NaN input.
2163 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
2164 FLOAT_TYPE ft, bool is_min) {
2165 assert_cond((ft != FLOAT_TYPE::half_precision) || UseZfh);
2166
2167 Label Done, Compare;
2168
2169 switch (ft) {
2170 case FLOAT_TYPE::half_precision:
2171 fclass_h(t0, src1);
2172 fclass_h(t1, src2);
2173
2174 orr(t0, t0, t1);
2175 andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2176 beqz(t0, Compare);
2177
2178 fadd_h(dst, src1, src2);
2179 j(Done);
2180
2181 bind(Compare);
2182 if (is_min) {
2183 fmin_h(dst, src1, src2);
2184 } else {
2185 fmax_h(dst, src1, src2);
2186 }
2187 break;
2188 case FLOAT_TYPE::single_precision:
2189 fclass_s(t0, src1);
2190 fclass_s(t1, src2);
2191
2192 orr(t0, t0, t1);
2193 andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2194 beqz(t0, Compare);
2195
2196 fadd_s(dst, src1, src2);
2197 j(Done);
2198
2199 bind(Compare);
2200 if (is_min) {
2201 fmin_s(dst, src1, src2);
2202 } else {
2203 fmax_s(dst, src1, src2);
2204 }
2205 break;
2206 case FLOAT_TYPE::double_precision:
2207 fclass_d(t0, src1);
2208 fclass_d(t1, src2);
2209
2210 orr(t0, t0, t1);
2211 andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2212 beqz(t0, Compare);
2213
2214 fadd_d(dst, src1, src2);
2215 j(Done);
2216
2217 bind(Compare);
2218 if (is_min) {
2219 fmin_d(dst, src1, src2);
2220 } else {
2221 fmax_d(dst, src1, src2);
2222 }
2223 break;
2224 default:
2225 ShouldNotReachHere();
2226 }
2227
2228 bind(Done);
2229 }
2230
2231 // According to Java SE specification, for floating-point round operations, if
2232 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the
2233 // rounded result; this differs from behavior of RISC-V fcvt instructions (which
2234 // round out-of-range values to the nearest max or min value), therefore special
2235 // handling is needed by NaN, +/-Infinity, +/-0.
2236 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
2237 Register tmp1, Register tmp2, Register tmp3) {
2238
2239 assert_different_registers(dst, src);
2240 assert_different_registers(tmp1, tmp2, tmp3);
2241
2242 // Set rounding mode for conversions
2243 // Here we use similar modes to double->long and long->double conversions
2244 // Different mode for long->double conversion matter only if long value was not representable as double,
2245 // we got long value as a result of double->long conversion so, it is definitely representable
2246 RoundingMode rm;
2247 switch (round_mode) {
2248 case RoundDoubleModeNode::rmode_ceil:
2249 rm = RoundingMode::rup;
2250 break;
2251 case RoundDoubleModeNode::rmode_floor:
2252 rm = RoundingMode::rdn;
2253 break;
2254 case RoundDoubleModeNode::rmode_rint:
2255 rm = RoundingMode::rne;
2256 break;
2257 default:
2258 ShouldNotReachHere();
2259 }
2260
2261 // tmp1 - is a register to store double converted to long int
2262 // tmp2 - is a register to create constant for comparison
2263 // tmp3 - is a register where we store modified result of double->long conversion
2264 Label done, bad_val;
2265
2266 // Conversion from double to long
2267 fcvt_l_d(tmp1, src, rm);
2268
2269 // Generate constant (tmp2)
2270 // tmp2 = 100...0000
2271 addi(tmp2, zr, 1);
2272 slli(tmp2, tmp2, 63);
2273
2274 // Prepare converted long (tmp1)
2275 // as a result when conversion overflow we got:
2276 // tmp1 = 011...1111 or 100...0000
2277 // Convert it to: tmp3 = 100...0000
2278 addi(tmp3, tmp1, 1);
2279 andi(tmp3, tmp3, -2);
2280 beq(tmp3, tmp2, bad_val);
2281
2282 // Conversion from long to double
2283 fcvt_d_l(dst, tmp1, rm);
2284 // Add sign of input value to result for +/- 0 cases
2285 fsgnj_d(dst, dst, src);
2286 j(done);
2287
2288 // If got conversion overflow return src
2289 bind(bad_val);
2290 fmv_d(dst, src);
2291
2292 bind(done);
2293 }
2294
2295 // According to Java SE specification, for floating-point signum operations, if
2296 // on input we have NaN or +/-0.0 value we should return it,
2297 // otherwise return +/- 1.0 using sign of input.
2298 // one - gives us a floating-point 1.0 (got from matching rule)
2299 // bool is_double - specifies single or double precision operations will be used.
2300 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
2301 Label done;
2302
2303 is_double ? fclass_d(t0, dst)
2304 : fclass_s(t0, dst);
2305
2306 // check if input is -0, +0, signaling NaN or quiet NaN
2307 andi(t0, t0, FClassBits::zero | FClassBits::nan);
2308
2309 bnez(t0, done);
2310
2311 // use floating-point 1.0 with a sign of input
2312 is_double ? fsgnj_d(dst, one, dst)
2313 : fsgnj_s(dst, one, dst);
2314
2315 bind(done);
2316 }
2317
2318 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
2319 #define __ masm.
2320 FloatRegister dst = stub.data<0>();
2321 Register src = stub.data<1>();
2322 Register tmp = stub.data<2>();
2323 __ bind(stub.entry());
2324
2325 // following instructions mainly focus on NaN, as riscv does not handle
2326 // NaN well with fcvt, but the code also works for Inf at the same time.
2327
2328 // construct a NaN in 32 bits from the NaN in 16 bits,
2329 // we need the payloads of non-canonical NaNs to be preserved.
2330 __ mv(tmp, 0x7f800000);
2331 // sign-bit was already set via sign-extension if necessary.
2332 __ slli(t0, src, 13);
2333 __ orr(tmp, t0, tmp);
2334 __ fmv_w_x(dst, tmp);
2335
2336 __ j(stub.continuation());
2337 #undef __
2338 }
2339
2340 // j.l.Float.float16ToFloat
2341 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
2342 auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
2343
2344 // On riscv, NaN needs a special process as fcvt does not work in that case.
2345 // On riscv, Inf does not need a special process as fcvt can handle it correctly.
2346 // but we consider to get the slow path to process NaN and Inf at the same time,
2347 // as both of them are rare cases, and if we try to get the slow path to handle
2348 // only NaN case it would sacrifise the performance for normal cases,
2349 // i.e. non-NaN and non-Inf cases.
2350
2351 // check whether it's a NaN or +/- Inf.
2352 mv(t0, 0x7c00);
2353 andr(tmp, src, t0);
2354 // jump to stub processing NaN and Inf cases.
2355 beq(t0, tmp, stub->entry(), true);
2356
2357 // non-NaN or non-Inf cases, just use built-in instructions.
2358 fmv_h_x(dst, src);
2359 fcvt_s_h(dst, dst);
2360
2361 bind(stub->continuation());
2362 }
2363
2364 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
2365 #define __ masm.
2366 Register dst = stub.data<0>();
2367 FloatRegister src = stub.data<1>();
2368 Register tmp = stub.data<2>();
2369 __ bind(stub.entry());
2370
2371 __ float_to_float16_NaN(dst, src, t0, tmp);
2372
2373 __ j(stub.continuation());
2374 #undef __
2375 }
2376
2377 // j.l.Float.floatToFloat16
2378 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
2379 auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 64, float_to_float16_slow_path);
2380
2381 // On riscv, NaN needs a special process as fcvt does not work in that case.
2382
2383 // check whether it's a NaN.
2384 // replace fclass with feq as performance optimization.
2385 feq_s(t0, src, src);
2386 // jump to stub processing NaN cases.
2387 beqz(t0, stub->entry(), true);
2388
2389 // non-NaN cases, just use built-in instructions.
2390 fcvt_h_s(ftmp, src);
2391 fmv_x_h(dst, ftmp);
2392
2393 bind(stub->continuation());
2394 }
2395
2396 static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
2397 #define __ masm.
2398 VectorRegister dst = stub.data<0>();
2399 VectorRegister src = stub.data<1>();
2400 uint vector_length = stub.data<2>();
2401 __ bind(stub.entry());
2402
2403 // following instructions mainly focus on NaN, as riscv does not handle
2404 // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
2405 //
2406 // construct NaN's in 32 bits from the NaN's in 16 bits,
2407 // we need the payloads of non-canonical NaNs to be preserved.
2408
2409 // adjust vector type to 2 * SEW.
2410 __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
2411 // widen and sign-extend src data.
2412 __ vsext_vf2(dst, src, Assembler::v0_t);
2413 __ mv(t0, 0x7f800000);
2414 // sign-bit was already set via sign-extension if necessary.
2415 __ vsll_vi(dst, dst, 13, Assembler::v0_t);
2416 __ vor_vx(dst, dst, t0, Assembler::v0_t);
2417
2418 __ j(stub.continuation());
2419 #undef __
2420 }
2421
2422 // j.l.Float.float16ToFloat
2423 void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
2424 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
2425 (dst, src, vector_length, 24, float16_to_float_v_slow_path);
2426 assert_different_registers(dst, src);
2427
2428 // On riscv, NaN needs a special process as vfwcvt_f_f_v does not work in that case.
2429 // On riscv, Inf does not need a special process as vfwcvt_f_f_v can handle it correctly.
2430 // but we consider to get the slow path to process NaN and Inf at the same time,
2431 // as both of them are rare cases, and if we try to get the slow path to handle
2432 // only NaN case it would sacrifise the performance for normal cases,
2433 // i.e. non-NaN and non-Inf cases.
2434
2435 vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);
2436
2437 // check whether there is a NaN or +/- Inf.
2438 mv(t0, 0x7c00);
2439 vand_vx(v0, src, t0);
2440 // v0 will be used as mask in slow path.
2441 vmseq_vx(v0, v0, t0);
2442 vcpop_m(t0, v0);
2443
2444 // For non-NaN or non-Inf cases, just use built-in instructions.
2445 vfwcvt_f_f_v(dst, src);
2446
2447 // jump to stub processing NaN and Inf cases if there is any of them in the vector-wide.
2448 bnez(t0, stub->entry(), true);
2449
2450 bind(stub->continuation());
2451 }
2452
2453 static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
2454 C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
2455 #define __ masm.
2456 VectorRegister dst = stub.data<0>();
2457 VectorRegister src = stub.data<1>();
2458 VectorRegister vtmp = stub.data<2>();
2459 assert_different_registers(dst, src, vtmp);
2460
2461 __ bind(stub.entry());
2462
2463 // Active elements (NaNs) are marked in v0 mask register.
2464 // mul is already set to mf2 in float_to_float16_v.
2465
2466 // Float (32 bits)
2467 // Bit: 31 30 to 23 22 to 0
2468 // +---+------------------+-----------------------------+
2469 // | S | Exponent | Mantissa (Fraction) |
2470 // +---+------------------+-----------------------------+
2471 // 1 bit 8 bits 23 bits
2472 //
2473 // Float (16 bits)
2474 // Bit: 15 14 to 10 9 to 0
2475 // +---+----------------+------------------+
2476 // | S | Exponent | Mantissa |
2477 // +---+----------------+------------------+
2478 // 1 bit 5 bits 10 bits
2479 const int fp_sign_bits = 1;
2480 const int fp32_bits = 32;
2481 const int fp32_mantissa_2nd_part_bits = 9;
2482 const int fp32_mantissa_3rd_part_bits = 4;
2483 const int fp16_exponent_bits = 5;
2484 const int fp16_mantissa_bits = 10;
2485
2486 // preserve the sign bit and exponent, clear mantissa.
2487 __ vnsra_wi(dst, src, fp32_bits - fp_sign_bits - fp16_exponent_bits, Assembler::v0_t);
2488 __ vsll_vi(dst, dst, fp16_mantissa_bits, Assembler::v0_t);
2489
2490 // Preserve high order bit of float NaN in the
2491 // binary16 result NaN (tenth bit); OR in remaining
2492 // bits into lower 9 bits of binary 16 significand.
2493 // | (doppel & 0x007f_e000) >> 13 // 10 bits
2494 // | (doppel & 0x0000_1ff0) >> 4 // 9 bits
2495 // | (doppel & 0x0000_000f)); // 4 bits
2496 //
2497 // Check j.l.Float.floatToFloat16 for more information.
2498 // 10 bits
2499 __ vnsrl_wi(vtmp, src, fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits, Assembler::v0_t);
2500 __ mv(t0, 0x3ff); // retain first part of mantissa in a float 32
2501 __ vand_vx(vtmp, vtmp, t0, Assembler::v0_t);
2502 __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
2503 // 9 bits
2504 __ vnsrl_wi(vtmp, src, fp32_mantissa_3rd_part_bits, Assembler::v0_t);
2505 __ mv(t0, 0x1ff); // retain second part of mantissa in a float 32
2506 __ vand_vx(vtmp, vtmp, t0, Assembler::v0_t);
2507 __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
2508 // 4 bits
2509 // Narrow shift is necessary to move data from 32 bits element to 16 bits element in vector register.
2510 __ vnsrl_wi(vtmp, src, 0, Assembler::v0_t);
2511 __ vand_vi(vtmp, vtmp, 0xf, Assembler::v0_t);
2512 __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
2513
2514 __ j(stub.continuation());
2515 #undef __
2516 }
2517
// j.l.Float.floatToFloat16
2519 void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src,
2520 VectorRegister vtmp, Register tmp, uint vector_length) {
2521 assert_different_registers(dst, src, vtmp);
2522
2523 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2524 (dst, src, vtmp, 56, float_to_float16_v_slow_path);
2525
2526 // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.
2527
2528 vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);
2529
2530 // check whether there is a NaN.
2531 // replace v_fclass with vmfne_vv as performance optimization.
2532 vmfne_vv(v0, src, src);
2533 vcpop_m(t0, v0);
2534
2535 vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);
2536
2537 // For non-NaN cases, just use built-in instructions.
2538 vfncvt_f_f_w(dst, src);
2539
2540 // jump to stub processing NaN cases.
2541 bnez(t0, stub->entry(), true);
2542
2543 bind(stub->continuation());
2544 }
2545
2546 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
2547 vsetvli_helper(bt, vlen);
2548
2549 // check if input is -0, +0, signaling NaN or quiet NaN
2550 vfclass_v(v0, dst);
2551 mv(t0, FClassBits::zero | FClassBits::nan);
2552 vand_vx(v0, v0, t0);
2553 vmseq_vi(v0, v0, 0);
2554
2555 // use floating-point 1.0 with a sign of input
2556 vfsgnj_vv(dst, one, dst, v0_t);
2557 }
2558
2559 // j.l.Math.round(float)
2560 // Returns the closest int to the argument, with ties rounding to positive infinity.
2561 // We need to handle 3 special cases defined by java api spec:
2562 // NaN,
2563 // float >= Integer.MAX_VALUE,
2564 // float <= Integer.MIN_VALUE.
2565 void C2_MacroAssembler::java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2566 BasicType bt, uint vector_length) {
2567 // In riscv, there is no straight corresponding rounding mode to satisfy the behaviour defined,
2568 // in java api spec, i.e. any rounding mode can not handle some corner cases, e.g.
2569 // RNE is the closest one, but it ties to "even", which means 1.5/2.5 both will be converted
2570 // to 2, instead of 2 and 3 respectively.
2571 // RUP does not work either, although java api requires "rounding to positive infinity",
2572 // but both 1.3/1.8 will be converted to 2, instead of 1 and 2 respectively.
2573 //
2574 // The optimal solution for non-NaN cases is:
2575 // src+0.5 => dst, with rdn rounding mode,
2576 // convert dst from float to int, with rnd rounding mode.
2577 // and, this solution works as expected for float >= Integer.MAX_VALUE and float <= Integer.MIN_VALUE.
2578 //
2579 // But, we still need to handle NaN explicilty with vector mask instructions.
2580 //
2581 // Check MacroAssembler::java_round_float and C2_MacroAssembler::vector_round_sve in aarch64 for more details.
2582
2583 csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2584 vsetvli_helper(bt, vector_length);
2585
2586 // don't rearrage the instructions sequence order without performance testing.
2587 // check MacroAssembler::java_round_float in riscv64 for more details.
2588 mv(t0, jint_cast(0.5f));
2589 fmv_w_x(ftmp, t0);
2590
2591 // replacing vfclass with feq as performance optimization
2592 vmfeq_vv(v0, src, src);
2593 // set dst = 0 in cases of NaN
2594 vmv_v_x(dst, zr);
2595
2596 // dst = (src + 0.5) rounded down towards negative infinity
2597 vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2598 vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2599
2600 csrwi(CSR_FRM, C2_MacroAssembler::rne);
2601 }
2602
2603 // java.lang.Math.round(double a)
2604 // Returns the closest long to the argument, with ties rounding to positive infinity.
2605 void C2_MacroAssembler::java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2606 BasicType bt, uint vector_length) {
2607 // check C2_MacroAssembler::java_round_float_v above for more details.
2608
2609 csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2610 vsetvli_helper(bt, vector_length);
2611
2612 mv(t0, julong_cast(0.5));
2613 fmv_d_x(ftmp, t0);
2614
2615 // replacing vfclass with feq as performance optimization
2616 vmfeq_vv(v0, src, src);
2617 // set dst = 0 in cases of NaN
2618 vmv_v_x(dst, zr);
2619
2620 // dst = (src + 0.5) rounded down towards negative infinity
2621 vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2622 vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2623
2624 csrwi(CSR_FRM, C2_MacroAssembler::rne);
2625 }
2626
2627 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
2628 VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE,
2629 Assembler::LMUL lmul) {
2630 Label loop;
2631 Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
2632
2633 bind(loop);
2634 vsetvli(tmp1, cnt, sew, lmul);
2635 vlex_v(vr1, a1, sew);
2636 vlex_v(vr2, a2, sew);
2637 vmsne_vv(vrs, vr1, vr2);
2638 vfirst_m(tmp2, vrs);
2639 bgez(tmp2, DONE);
2640 sub(cnt, cnt, tmp1);
2641 if (!islatin) {
2642 slli(tmp1, tmp1, 1); // get byte counts
2643 }
2644 add(a1, a1, tmp1);
2645 add(a2, a2, tmp1);
2646 bnez(cnt, loop);
2647
2648 mv(result, true);
2649 }
2650
2651 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
2652 Label DONE;
2653 Register tmp1 = t0;
2654 Register tmp2 = t1;
2655
2656 BLOCK_COMMENT("string_equals_v {");
2657
2658 mv(result, false);
2659
2660 element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE, Assembler::m2);
2661
2662 bind(DONE);
2663 BLOCK_COMMENT("} string_equals_v");
2664 }
2665
2666 // used by C2 ClearArray patterns.
2667 // base: Address of a buffer to be zeroed
2668 // cnt: Count in HeapWords
2669 //
2670 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2671 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
2672 Label loop;
2673
2674 // making zero words
2675 vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2676 vxor_vv(v4, v4, v4);
2677
2678 bind(loop);
2679 vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2680 vse64_v(v4, base);
2681 sub(cnt, cnt, t0);
2682 shadd(base, t0, base, t0, 3);
2683 bnez(cnt, loop);
2684 }
2685
2686 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
2687 Register cnt1, int elem_size) {
2688 assert(elem_size == 1 || elem_size == 2, "must be char or byte");
2689 assert_different_registers(a1, a2, result, cnt1, t0, t1);
2690
2691 Label DONE;
2692 Register tmp1 = t0;
2693 Register tmp2 = t1;
2694 Register cnt2 = tmp2;
2695 int length_offset = arrayOopDesc::length_offset_in_bytes();
2696 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
2697
2698 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
2699
2700 BLOCK_COMMENT("arrays_equals_v {");
2701
2702 // if (a1 == a2), return true
2703 mv(result, true);
2704 beq(a1, a2, DONE);
2705
2706 mv(result, false);
2707 // if a1 == null or a2 == null, return false
2708 beqz(a1, DONE);
2709 beqz(a2, DONE);
2710 // if (a1.length != a2.length), return false
2711 lwu(cnt1, Address(a1, length_offset));
2712 lwu(cnt2, Address(a2, length_offset));
2713 bne(cnt1, cnt2, DONE);
2714
2715 la(a1, Address(a1, base_offset));
2716 la(a2, Address(a2, base_offset));
2717
2718 element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE, Assembler::m2);
2719
2720 bind(DONE);
2721
2722 BLOCK_COMMENT("} arrays_equals_v");
2723 }
2724
2725 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
2726 Register result, Register tmp1, Register tmp2, int encForm) {
2727 Label DIFFERENCE, DONE, L, loop;
2728 bool encLL = encForm == StrIntrinsicNode::LL;
2729 bool encLU = encForm == StrIntrinsicNode::LU;
2730 bool encUL = encForm == StrIntrinsicNode::UL;
2731
2732 bool str1_isL = encLL || encLU;
2733 bool str2_isL = encLL || encUL;
2734
2735 int minCharsInWord = encLL ? wordSize : wordSize / 2;
2736
2737 BLOCK_COMMENT("string_compare_v {");
2738
2739 // for Latin strings, 1 byte for 1 character
2740 // for UTF16 strings, 2 bytes for 1 character
2741 if (!str1_isL)
2742 sraiw(cnt1, cnt1, 1);
2743 if (!str2_isL)
2744 sraiw(cnt2, cnt2, 1);
2745
2746 // if str1 == str2, return the difference
2747 // save the minimum of the string lengths in cnt2.
2748 sub(result, cnt1, cnt2);
2749 bgt(cnt1, cnt2, L);
2750 mv(cnt2, cnt1);
2751 bind(L);
2752
2753 // We focus on the optimization of small sized string.
2754 // Please check below document for string size distribution statistics.
2755 // https://cr.openjdk.org/~shade/density/string-density-report.pdf
2756 if (str1_isL == str2_isL) { // LL or UU
2757 // Below construction of v regs and lmul is based on test on 2 different boards,
2758 // vlen == 128 and vlen == 256 respectively.
2759 if (!encLL && MaxVectorSize == 16) { // UU
2760 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v4, v8, v4, encLL, DIFFERENCE, Assembler::m4);
2761 } else { // UU + MaxVectorSize or LL
2762 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE, Assembler::m2);
2763 }
2764
2765 j(DONE);
2766 } else { // LU or UL
2767 Register strL = encLU ? str1 : str2;
2768 Register strU = encLU ? str2 : str1;
2769 VectorRegister vstr1 = encLU ? v8 : v4;
2770 VectorRegister vstr2 = encLU ? v4 : v8;
2771
2772 bind(loop);
2773 vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
2774 vle8_v(vstr1, strL);
2775 vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
2776 vzext_vf2(vstr2, vstr1);
2777 vle16_v(vstr1, strU);
2778 vmsne_vv(v4, vstr2, vstr1);
2779 vfirst_m(tmp2, v4);
2780 bgez(tmp2, DIFFERENCE);
2781 sub(cnt2, cnt2, tmp1);
2782 add(strL, strL, tmp1);
2783 shadd(strU, tmp1, strU, tmp1, 1);
2784 bnez(cnt2, loop);
2785 j(DONE);
2786 }
2787
2788 bind(DIFFERENCE);
2789 slli(tmp1, tmp2, 1);
2790 add(str1, str1, str1_isL ? tmp2 : tmp1);
2791 add(str2, str2, str2_isL ? tmp2 : tmp1);
2792 str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2793 str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2794 sub(result, tmp1, tmp2);
2795
2796 bind(DONE);
2797
2798 BLOCK_COMMENT("} string_compare_v");
2799 }
2800
2801 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2802 Label loop;
2803 assert_different_registers(src, dst, len, tmp, t0);
2804
2805 BLOCK_COMMENT("byte_array_inflate_v {");
2806 bind(loop);
2807 vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2808 vle8_v(v6, src);
2809 vsetvli(t0, len, Assembler::e16, Assembler::m4);
2810 vzext_vf2(v4, v6);
2811 vse16_v(v4, dst);
2812 sub(len, len, tmp);
2813 add(src, src, tmp);
2814 shadd(dst, tmp, dst, tmp, 1);
2815 bnez(len, loop);
2816 BLOCK_COMMENT("} byte_array_inflate_v");
2817 }
2818
2819 // Compress char[] array to byte[].
2820 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
2821 // result: the array length if every element in array can be encoded,
2822 // otherwise, the index of first non-latin1 (> 0xff) character.
2823 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2824 Register result, Register tmp) {
2825 encode_iso_array_v(src, dst, len, result, tmp, false);
2826 }
2827
2828 // Intrinsic for
2829 //
2830 // - sun.nio.cs.ISO_8859_1.Encoder#encodeISOArray0(byte[] sa, int sp, byte[] da, int dp, int len)
2831 // Encodes char[] to byte[] in ISO-8859-1
2832 //
2833 // - java.lang.StringCoding#encodeISOArray0(byte[] sa, int sp, byte[] da, int dp, int len)
2834 // Encodes byte[] (containing UTF-16) to byte[] in ISO-8859-1
2835 //
2836 // - java.lang.StringCoding#encodeAsciiArray0(char[] sa, int sp, byte[] da, int dp, int len)
2837 // Encodes char[] to byte[] in ASCII
2838 //
2839 // This version always returns the number of characters copied. A successful
2840 // copy will complete with the post-condition: 'res' == 'len', while an
2841 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2842 //
2843 // Clobbers: src, dst, len, result, t0
2844 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2845 Register result, Register tmp, bool ascii) {
2846 Label loop, fail, done;
2847
2848 BLOCK_COMMENT("encode_iso_array_v {");
2849 mv(result, 0);
2850
2851 bind(loop);
2852 mv(tmp, ascii ? 0x7f : 0xff);
2853 vsetvli(t0, len, Assembler::e16, Assembler::m2);
2854 vle16_v(v2, src);
2855
2856 vmsgtu_vx(v1, v2, tmp);
2857 vfirst_m(tmp, v1);
2858 vmsbf_m(v0, v1);
2859 // compress char to byte
2860 vsetvli(t0, len, Assembler::e8);
2861 vncvt_x_x_w(v1, v2, Assembler::v0_t);
2862 vse8_v(v1, dst, Assembler::v0_t);
2863
2864 // fail if char > 0x7f/0xff
2865 bgez(tmp, fail);
2866 add(result, result, t0);
2867 add(dst, dst, t0);
2868 sub(len, len, t0);
2869 shadd(src, t0, src, t0, 1);
2870 bnez(len, loop);
2871 j(done);
2872
2873 bind(fail);
2874 add(result, result, tmp);
2875
2876 bind(done);
2877 BLOCK_COMMENT("} encode_iso_array_v");
2878 }
2879
2880 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2881 Label LOOP, SET_RESULT, DONE;
2882
2883 BLOCK_COMMENT("count_positives_v {");
2884 assert_different_registers(ary, len, result, tmp);
2885
2886 mv(result, zr);
2887
2888 bind(LOOP);
2889 vsetvli(t0, len, Assembler::e8, Assembler::m4);
2890 vle8_v(v4, ary);
2891 vmslt_vx(v4, v4, zr);
2892 vfirst_m(tmp, v4);
2893 bgez(tmp, SET_RESULT);
2894 // if tmp == -1, all bytes are positive
2895 add(result, result, t0);
2896
2897 sub(len, len, t0);
2898 add(ary, ary, t0);
2899 bnez(len, LOOP);
2900 j(DONE);
2901
2902 // add remaining positive bytes count
2903 bind(SET_RESULT);
2904 add(result, result, tmp);
2905
2906 bind(DONE);
2907 BLOCK_COMMENT("} count_positives_v");
2908 }
2909
2910 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2911 Register ch, Register result,
2912 Register tmp1, Register tmp2,
2913 bool isL) {
2914 mv(result, zr);
2915
2916 Label loop, MATCH, DONE;
2917 Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2918 bind(loop);
2919 vsetvli(tmp1, cnt1, sew, Assembler::m4);
2920 vlex_v(v4, str1, sew);
2921 vmseq_vx(v4, v4, ch);
2922 vfirst_m(tmp2, v4);
2923 bgez(tmp2, MATCH); // if equal, return index
2924
2925 add(result, result, tmp1);
2926 sub(cnt1, cnt1, tmp1);
2927 if (!isL) slli(tmp1, tmp1, 1);
2928 add(str1, str1, tmp1);
2929 bnez(cnt1, loop);
2930
2931 mv(result, -1);
2932 j(DONE);
2933
2934 bind(MATCH);
2935 add(result, result, tmp2);
2936
2937 bind(DONE);
2938 }
2939
2940 // Set dst to NaN if any NaN input.
2941 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2942 BasicType bt, bool is_min, uint vector_length) {
2943 assert_different_registers(dst, src1, src2);
2944
2945 vsetvli_helper(bt, vector_length);
2946
2947 is_min ? vfmin_vv(dst, src1, src2)
2948 : vfmax_vv(dst, src1, src2);
2949
2950 vmfne_vv(v0, src1, src1);
2951 vfadd_vv(dst, src1, src1, Assembler::v0_t);
2952 vmfne_vv(v0, src2, src2);
2953 vfadd_vv(dst, src2, src2, Assembler::v0_t);
2954 }
2955
2956 // Set dst to NaN if any NaN input.
2957 // The destination vector register elements corresponding to masked-off elements
2958 // are handled with a mask-undisturbed policy.
2959 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2960 VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2961 BasicType bt, bool is_min, uint vector_length) {
2962 assert_different_registers(src1, src2, tmp1, tmp2);
2963 vsetvli_helper(bt, vector_length);
2964
2965 // Check vector elements of src1 and src2 for NaN.
2966 vmfeq_vv(tmp1, src1, src1);
2967 vmfeq_vv(tmp2, src2, src2);
2968
2969 vmandn_mm(v0, vmask, tmp1);
2970 vfadd_vv(dst, src1, src1, Assembler::v0_t);
2971 vmandn_mm(v0, vmask, tmp2);
2972 vfadd_vv(dst, src2, src2, Assembler::v0_t);
2973
2974 vmand_mm(tmp2, tmp1, tmp2);
2975 vmand_mm(v0, vmask, tmp2);
2976 is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
2977 : vfmax_vv(dst, src1, src2, Assembler::v0_t);
2978 }
2979
2980 // Set dst to NaN if any NaN input.
2981 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
2982 FloatRegister src1, VectorRegister src2,
2983 VectorRegister tmp1, VectorRegister tmp2,
2984 bool is_double, bool is_min, uint vector_length, VectorMask vm) {
2985 assert_different_registers(dst, src1);
2986 assert_different_registers(src2, tmp1, tmp2);
2987
2988 Label L_done, L_NaN_1, L_NaN_2;
2989 // Set dst to src1 if src1 is NaN
2990 is_double ? feq_d(t0, src1, src1)
2991 : feq_s(t0, src1, src1);
2992 beqz(t0, L_NaN_2);
2993
2994 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
2995 vfmv_s_f(tmp2, src1);
2996
2997 is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
2998 : vfredmax_vs(tmp1, src2, tmp2, vm);
2999 vfmv_f_s(dst, tmp1);
3000
3001 // Checking NaNs in src2
3002 vmfne_vv(tmp1, src2, src2, vm);
3003 vcpop_m(t0, tmp1, vm);
3004 beqz(t0, L_done);
3005
3006 bind(L_NaN_1);
3007 vfredusum_vs(tmp1, src2, tmp2, vm);
3008 vfmv_f_s(dst, tmp1);
3009 j(L_done);
3010
3011 bind(L_NaN_2);
3012 is_double ? fmv_d(dst, src1)
3013 : fmv_s(dst, src1);
3014 bind(L_done);
3015 }
3016
3017 bool C2_MacroAssembler::in_scratch_emit_size() {
3018 if (ciEnv::current()->task() != nullptr) {
3019 PhaseOutput* phase_output = Compile::current()->output();
3020 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
3021 return true;
3022 }
3023 }
3024 return MacroAssembler::in_scratch_emit_size();
3025 }
3026
3027 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
3028 VectorRegister src2, VectorRegister tmp,
3029 int opc, BasicType bt, uint vector_length, VectorMask vm) {
3030 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
3031 vsetvli_helper(bt, vector_length);
3032 vmv_s_x(tmp, src1);
3033 switch (opc) {
3034 case Op_AddReductionVI:
3035 case Op_AddReductionVL:
3036 vredsum_vs(tmp, src2, tmp, vm);
3037 break;
3038 case Op_AndReductionV:
3039 vredand_vs(tmp, src2, tmp, vm);
3040 break;
3041 case Op_OrReductionV:
3042 vredor_vs(tmp, src2, tmp, vm);
3043 break;
3044 case Op_XorReductionV:
3045 vredxor_vs(tmp, src2, tmp, vm);
3046 break;
3047 case Op_MaxReductionV:
3048 vredmax_vs(tmp, src2, tmp, vm);
3049 break;
3050 case Op_MinReductionV:
3051 vredmin_vs(tmp, src2, tmp, vm);
3052 break;
3053 default:
3054 ShouldNotReachHere();
3055 }
3056 vmv_x_s(dst, tmp);
3057 }
3058
3059 void C2_MacroAssembler::reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2,
3060 VectorRegister vtmp1, VectorRegister vtmp2,
3061 BasicType bt, uint vector_length, VectorMask vm) {
3062 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
3063 vsetvli_helper(bt, vector_length);
3064
3065 vector_length /= 2;
3066 if (vm != Assembler::unmasked) {
3067 // This behaviour is consistent with spec requirements of vector API, for `reduceLanes`:
3068 // If no elements are selected, an operation-specific identity value is returned.
3069 // If the operation is MUL, then the identity value is one.
3070 vmv_v_i(vtmp1, 1);
3071 vmerge_vvm(vtmp2, vtmp1, src2); // vm == v0
3072 slidedown_v(vtmp1, vtmp2, vector_length);
3073
3074 vsetvli_helper(bt, vector_length);
3075 vmul_vv(vtmp1, vtmp1, vtmp2);
3076 } else {
3077 slidedown_v(vtmp1, src2, vector_length);
3078
3079 vsetvli_helper(bt, vector_length);
3080 vmul_vv(vtmp1, vtmp1, src2);
3081 }
3082
3083 while (vector_length > 1) {
3084 vector_length /= 2;
3085 slidedown_v(vtmp2, vtmp1, vector_length);
3086 vsetvli_helper(bt, vector_length);
3087 vmul_vv(vtmp1, vtmp1, vtmp2);
3088 }
3089
3090 vmv_x_s(dst, vtmp1);
3091 if (bt == T_INT) {
3092 mulw(dst, dst, src1);
3093 } else {
3094 mul(dst, dst, src1);
3095 }
3096 }
3097
3098 // Set vl and vtype for full and partial vector operations.
3099 // (vma = mu, vta = tu, vill = false)
3100 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
3101 Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
3102 if (vector_length <= 31) {
3103 vsetivli(tmp, vector_length, sew, vlmul);
3104 } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
3105 vsetvli(tmp, x0, sew, vlmul);
3106 } else {
3107 mv(tmp, vector_length);
3108 vsetvli(tmp, tmp, sew, vlmul);
3109 }
3110 }
3111
3112 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
3113 int cond, BasicType bt, uint vector_length, VectorMask vm) {
3114 assert(is_integral_type(bt), "unsupported element type");
3115 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
3116 vsetvli_helper(bt, vector_length);
3117 if (vm == Assembler::v0_t) {
3118 vmclr_m(vd);
3119 }
3120 switch (cond) {
3121 case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
3122 case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
3123 case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
3124 case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
3125 case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
3126 case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
3127 case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break;
3128 case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break;
3129 case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break;
3130 case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break;
3131 default:
3132 assert(false, "unsupported compare condition");
3133 ShouldNotReachHere();
3134 }
3135 }
3136
3137 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
3138 int cond, BasicType bt, uint vector_length, VectorMask vm) {
3139 assert(is_floating_point_type(bt), "unsupported element type");
3140 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
3141 vsetvli_helper(bt, vector_length);
3142 if (vm == Assembler::v0_t) {
3143 vmclr_m(vd);
3144 }
3145 switch (cond) {
3146 case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
3147 case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
3148 case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
3149 case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
3150 case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
3151 case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
3152 default:
3153 assert(false, "unsupported compare condition");
3154 ShouldNotReachHere();
3155 }
3156 }
3157
3158 // In Matcher::scalable_predicate_reg_slots,
3159 // we assume each predicate register is one-eighth of the size of
3160 // scalable vector register, one mask bit per vector byte.
3161 void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) {
3162 vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
3163 add(t0, sp, offset);
3164 vse8_v(v, t0);
3165 }
3166
3167 void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) {
3168 vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
3169 add(t0, sp, offset);
3170 vle8_v(v, t0);
3171 }
3172
3173 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
3174 VectorRegister src, BasicType src_bt, bool is_signed) {
3175 assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
3176 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
3177 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
3178 // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
3179 // and the overlap is in the highest-numbered part of the destination register group.
3180 // Since LMUL=1, vd and vs cannot be the same.
3181 assert_different_registers(dst, src);
3182
3183 vsetvli_helper(dst_bt, vector_length);
3184 if (is_signed) {
3185 if (src_bt == T_BYTE) {
3186 switch (dst_bt) {
3187 case T_SHORT:
3188 vsext_vf2(dst, src);
3189 break;
3190 case T_INT:
3191 vsext_vf4(dst, src);
3192 break;
3193 case T_LONG:
3194 vsext_vf8(dst, src);
3195 break;
3196 default:
3197 ShouldNotReachHere();
3198 }
3199 } else if (src_bt == T_SHORT) {
3200 if (dst_bt == T_INT) {
3201 vsext_vf2(dst, src);
3202 } else {
3203 vsext_vf4(dst, src);
3204 }
3205 } else if (src_bt == T_INT) {
3206 vsext_vf2(dst, src);
3207 }
3208 } else {
3209 if (src_bt == T_BYTE) {
3210 switch (dst_bt) {
3211 case T_SHORT:
3212 vzext_vf2(dst, src);
3213 break;
3214 case T_INT:
3215 vzext_vf4(dst, src);
3216 break;
3217 case T_LONG:
3218 vzext_vf8(dst, src);
3219 break;
3220 default:
3221 ShouldNotReachHere();
3222 }
3223 } else if (src_bt == T_SHORT) {
3224 if (dst_bt == T_INT) {
3225 vzext_vf2(dst, src);
3226 } else {
3227 vzext_vf4(dst, src);
3228 }
3229 } else if (src_bt == T_INT) {
3230 vzext_vf2(dst, src);
3231 }
3232 }
3233 }
3234
3235 // Vector narrow from src to dst with specified element sizes.
3236 // High part of dst vector will be filled with zero.
3237 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
3238 VectorRegister src, BasicType src_bt) {
3239 assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
3240 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
3241 mv(t0, vector_length);
3242 if (src_bt == T_LONG) {
3243 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
3244 // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
3245 // So we can currently only scale down by 1/2 the width at a time.
3246 vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
3247 vncvt_x_x_w(dst, src);
3248 if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
3249 vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
3250 vncvt_x_x_w(dst, dst);
3251 if (dst_bt == T_BYTE) {
3252 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3253 vncvt_x_x_w(dst, dst);
3254 }
3255 }
3256 } else if (src_bt == T_INT) {
3257 // T_SHORT
3258 vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
3259 vncvt_x_x_w(dst, src);
3260 if (dst_bt == T_BYTE) {
3261 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3262 vncvt_x_x_w(dst, dst);
3263 }
3264 } else if (src_bt == T_SHORT) {
3265 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3266 vncvt_x_x_w(dst, src);
3267 }
3268 }
3269
3270 #define VFCVT_SAFE(VFLOATCVT) \
3271 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
3272 assert_different_registers(dst, src); \
3273 vxor_vv(dst, dst, dst); \
3274 vmfeq_vv(v0, src, src); \
3275 VFLOATCVT(dst, src, Assembler::v0_t); \
3276 }
3277
3278 VFCVT_SAFE(vfcvt_rtz_x_f_v);
3279
3280 #undef VFCVT_SAFE
3281
3282 // Extract a scalar element from an vector at position 'idx'.
3283 // The input elements in src are expected to be of integral type.
3284 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src,
3285 BasicType bt, int idx, VectorRegister vtmp) {
3286 assert(is_integral_type(bt), "unsupported element type");
3287 assert(idx >= 0, "idx cannot be negative");
3288 // Only need the first element after vector slidedown
3289 vsetvli_helper(bt, 1);
3290 if (idx == 0) {
3291 vmv_x_s(dst, src);
3292 } else {
3293 slidedown_v(vtmp, src, idx);
3294 vmv_x_s(dst, vtmp);
3295 }
3296 }
3297
3298 // Extract a scalar element from an vector at position 'idx'.
3299 // The input elements in src are expected to be of floating point type.
3300 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src,
3301 BasicType bt, int idx, VectorRegister vtmp) {
3302 assert(is_floating_point_type(bt), "unsupported element type");
3303 assert(idx >= 0, "idx cannot be negative");
3304 // Only need the first element after vector slidedown
3305 vsetvli_helper(bt, 1);
3306 if (idx == 0) {
3307 vfmv_f_s(dst, src);
3308 } else {
3309 slidedown_v(vtmp, src, idx);
3310 vfmv_f_s(dst, vtmp);
3311 }
3312 }
3313
3314 // Move elements down a vector register group.
3315 // Offset is the start index (offset) for the source.
3316 void C2_MacroAssembler::slidedown_v(VectorRegister dst, VectorRegister src,
3317 uint32_t offset, Register tmp) {
3318 if (is_uimm5(offset)) {
3319 vslidedown_vi(dst, src, offset);
3320 } else {
3321 mv(tmp, offset);
3322 vslidedown_vx(dst, src, tmp);
3323 }
3324 }