1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "opto/c2_MacroAssembler.hpp"
29 #include "opto/compile.hpp"
30 #include "opto/intrinsicnode.hpp"
31 #include "opto/output.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/objectMonitorTable.hpp"
34 #include "runtime/stubRoutines.hpp"
35 #include "runtime/synchronizer.hpp"
36 #include "utilities/globalDefinitions.hpp"
37
// Annotation helpers for the generated code.
//
// Outside PRODUCT builds BLOCK_COMMENT forwards to block_comment() and STOP
// records the message through block_comment() before calling stop(). In
// PRODUCT builds BLOCK_COMMENT expands to nothing and STOP is just stop().
//
// NOTE: the non-PRODUCT STOP and BIND expand to two statements, so neither
// may be used as the sole body of an unbraced if/else/loop.
#ifndef PRODUCT
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#else
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#endif

// Binds a label and, where block comments are enabled, annotates the bind
// position with the label's spelling followed by ':'.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
47
48 void C2_MacroAssembler::entry_barrier() {
49 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
50 // Dummy labels for just measuring the code size
51 Label dummy_slow_path;
52 Label dummy_continuation;
53 Label dummy_guard;
54 Label* slow_path = &dummy_slow_path;
55 Label* continuation = &dummy_continuation;
56 Label* guard = &dummy_guard;
57
58 if (!Compile::current()->output()->in_scratch_emit_size()) {
59 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
60 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
61 Compile::current()->output()->add_stub(stub);
62 slow_path = &stub->entry();
63 continuation = &stub->continuation();
64 guard = &stub->guard();
65 }
66
67 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
68 bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
69 }
70
71 void C2_MacroAssembler::fast_lock(Register obj, Register box,
72 Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
73 // Flag register, zero for success; non-zero for failure.
74 Register flag = t1;
75
76 assert_different_registers(obj, box, tmp1, tmp2, tmp3, tmp4, flag, t0);
77
78 mv(flag, 1);
79
80 // Handle inflated monitor.
81 Label inflated;
82 // Finish fast lock successfully. MUST branch to with flag == 0
83 Label locked;
84 // Finish fast lock unsuccessfully. slow_path MUST branch to with flag != 0
85 Label slow_path;
86
87 if (UseObjectMonitorTable) {
88 // Clear cache in case fast locking succeeds or we need to take the slow-path.
89 sd(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
90 }
91
92 if (DiagnoseSyncOnValueBasedClasses != 0) {
93 load_klass(tmp1, obj);
94 lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
95 test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
96 bnez(tmp1, slow_path);
97 }
98
99 const Register tmp1_mark = tmp1;
100 const Register tmp3_t = tmp3;
101
102 { // Fast locking
103
104 // Push lock to the lock stack and finish successfully. MUST branch to with flag == 0
105 Label push;
106
107 const Register tmp2_top = tmp2;
108
109 // Check if lock-stack is full.
110 lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
111 mv(tmp3_t, (unsigned)LockStack::end_offset());
112 bge(tmp2_top, tmp3_t, slow_path);
113
114 // Check if recursive.
115 add(tmp3_t, xthread, tmp2_top);
116 ld(tmp3_t, Address(tmp3_t, -oopSize));
117 beq(obj, tmp3_t, push);
118
119 // Relaxed normal load to check for monitor. Optimization for monitor case.
120 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
121 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
122 bnez(tmp3_t, inflated);
123
124 // Not inflated
125 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
126
127 // Try to lock. Transition lock-bits 0b01 => 0b00
128 ori(tmp1_mark, tmp1_mark, markWord::unlocked_value);
129 xori(tmp3_t, tmp1_mark, markWord::unlocked_value);
130 cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
131 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_t);
132 bne(tmp1_mark, tmp3_t, slow_path);
133
134 bind(push);
135 // After successful lock, push object on lock-stack.
136 add(tmp3_t, xthread, tmp2_top);
137 sd(obj, Address(tmp3_t));
138 addw(tmp2_top, tmp2_top, oopSize);
139 sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
140 j(locked);
141 }
142
143 { // Handle inflated monitor.
144 bind(inflated);
145
146 const Register tmp1_monitor = tmp1;
147
148 if (!UseObjectMonitorTable) {
149 assert(tmp1_monitor == tmp1_mark, "should be the same here");
150 } else {
151 const Register tmp2_hash = tmp2;
152 const Register tmp3_bucket = tmp3;
153 Label monitor_found;
154
155 // Save the mark, we might need it to extract the hash.
156 mv(tmp2_hash, tmp1_mark);
157
158 // Look for the monitor in the om_cache.
159
160 ByteSize cache_offset = JavaThread::om_cache_oops_offset();
161 ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
162 const int num_unrolled = OMCache::CAPACITY;
163 for (int i = 0; i < num_unrolled; i++) {
164 ld(tmp1_monitor, Address(xthread, cache_offset + monitor_offset));
165 ld(tmp4, Address(xthread, cache_offset));
166 beq(obj, tmp4, monitor_found);
167 cache_offset = cache_offset + OMCache::oop_to_oop_difference();
168 }
169
170 // Look for the monitor in the table.
171
172 // Get the hash code.
173 srli(tmp2_hash, tmp2_hash, markWord::hash_shift);
174
175 // Get the table and calculate the bucket's address.
176 la(tmp3_t, ExternalAddress(ObjectMonitorTable::current_table_address()));
177 ld(tmp3_t, Address(tmp3_t));
178 ld(tmp1, Address(tmp3_t, ObjectMonitorTable::table_capacity_mask_offset()));
179 andr(tmp2_hash, tmp2_hash, tmp1);
180 ld(tmp3_t, Address(tmp3_t, ObjectMonitorTable::table_buckets_offset()));
181
182 // Read the monitor from the bucket.
183 shadd(tmp3_bucket, tmp2_hash, tmp3_t, tmp4, LogBytesPerWord);
184 ld(tmp1_monitor, Address(tmp3_bucket));
185
186 // Check if the monitor in the bucket is special (empty, tombstone or removed).
187 mv(tmp2, ObjectMonitorTable::SpecialPointerValues::below_is_special);
188 bltu(tmp1_monitor, tmp2, slow_path);
189
190 // Check if object matches.
191 ld(tmp3, Address(tmp1_monitor, ObjectMonitor::object_offset()));
192 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
193 bs_asm->try_peek_weak_handle_in_nmethod(this, tmp3, tmp3, tmp2, slow_path);
194 bne(tmp3, obj, slow_path);
195
196 bind(monitor_found);
197 }
198
199 const Register tmp2_owner_addr = tmp2;
200 const Register tmp3_owner = tmp3;
201
202 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
203 const Address owner_address(tmp1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
204 const Address recursions_address(tmp1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
205
206 Label monitor_locked;
207
208 // Compute owner address.
209 la(tmp2_owner_addr, owner_address);
210
211 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
212 Register tid = tmp4;
213 ld(tid, Address(xthread, JavaThread::monitor_owner_id_offset()));
214 cmpxchg(/*addr*/ tmp2_owner_addr, /*expected*/ zr, /*new*/ tid, Assembler::int64,
215 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ tmp3_owner);
216 beqz(tmp3_owner, monitor_locked);
217
218 // Check if recursive.
219 bne(tmp3_owner, tid, slow_path);
220
221 // Recursive.
222 increment(recursions_address, 1, tmp2, tmp3);
223
224 bind(monitor_locked);
225 if (UseObjectMonitorTable) {
226 sd(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
227 }
228 }
229
230 bind(locked);
231 mv(flag, zr);
232
233 #ifdef ASSERT
234 // Check that locked label is reached with flag == 0.
235 Label flag_correct;
236 beqz(flag, flag_correct);
237 stop("Fast Lock Flag != 0");
238 #endif
239
240 bind(slow_path);
241 #ifdef ASSERT
242 // Check that slow_path label is reached with flag != 0.
243 bnez(flag, flag_correct);
244 stop("Fast Lock Flag == 0");
245 bind(flag_correct);
246 #endif
247 // C2 uses the value of flag (0 vs !0) to determine the continuation.
248 }
249
250 void C2_MacroAssembler::fast_unlock(Register obj, Register box,
251 Register tmp1, Register tmp2, Register tmp3) {
252 // Flag register, zero for success; non-zero for failure.
253 Register flag = t1;
254
255 assert_different_registers(obj, box, tmp1, tmp2, tmp3, flag, t0);
256
257 mv(flag, 1);
258
259 // Handle inflated monitor.
260 Label inflated, inflated_load_mark;
261 // Finish fast unlock successfully. unlocked MUST branch to with flag == 0
262 Label unlocked;
263 // Finish fast unlock unsuccessfully. MUST branch to with flag != 0
264 Label slow_path;
265
266 const Register tmp1_mark = tmp1;
267 const Register tmp2_top = tmp2;
268 const Register tmp3_t = tmp3;
269
270 { // Fast unlock
271 Label push_and_slow_path;
272
273 // Check if obj is top of lock-stack.
274 lwu(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
275 subw(tmp2_top, tmp2_top, oopSize);
276 add(tmp3_t, xthread, tmp2_top);
277 ld(tmp3_t, Address(tmp3_t));
278 // Top of lock stack was not obj. Must be monitor.
279 bne(obj, tmp3_t, inflated_load_mark);
280
281 // Pop lock-stack.
282 DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
283 DEBUG_ONLY(sd(zr, Address(tmp3_t));)
284 sw(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
285
286 // Check if recursive.
287 add(tmp3_t, xthread, tmp2_top);
288 ld(tmp3_t, Address(tmp3_t, -oopSize));
289 beq(obj, tmp3_t, unlocked);
290
291 // Not recursive.
292 // Load Mark.
293 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
294
295 // Check header for monitor (0b10).
296 // Because we got here by popping (meaning we pushed in locked)
297 // there will be no monitor in the box. So we need to push back the obj
298 // so that the runtime can fix any potential anonymous owner.
299 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
300 bnez(tmp3_t, UseObjectMonitorTable ? push_and_slow_path : inflated);
301
302 // Try to unlock. Transition lock bits 0b00 => 0b01
303 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
304 ori(tmp3_t, tmp1_mark, markWord::unlocked_value);
305 cmpxchg(/*addr*/ obj, /*expected*/ tmp1_mark, /*new*/ tmp3_t, Assembler::int64,
306 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ tmp3_t);
307 beq(tmp1_mark, tmp3_t, unlocked);
308
309 bind(push_and_slow_path);
310 // Compare and exchange failed.
311 // Restore lock-stack and handle the unlock in runtime.
312 DEBUG_ONLY(add(tmp3_t, xthread, tmp2_top);)
313 DEBUG_ONLY(sd(obj, Address(tmp3_t));)
314 addw(tmp2_top, tmp2_top, oopSize);
315 sd(tmp2_top, Address(xthread, JavaThread::lock_stack_top_offset()));
316 j(slow_path);
317 }
318
319 { // Handle inflated monitor.
320 bind(inflated_load_mark);
321 ld(tmp1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
322 #ifdef ASSERT
323 test_bit(tmp3_t, tmp1_mark, exact_log2(markWord::monitor_value));
324 bnez(tmp3_t, inflated);
325 stop("Fast Unlock not monitor");
326 #endif
327
328 bind(inflated);
329
330 #ifdef ASSERT
331 Label check_done;
332 subw(tmp2_top, tmp2_top, oopSize);
333 mv(tmp3_t, in_bytes(JavaThread::lock_stack_base_offset()));
334 blt(tmp2_top, tmp3_t, check_done);
335 add(tmp3_t, xthread, tmp2_top);
336 ld(tmp3_t, Address(tmp3_t));
337 bne(obj, tmp3_t, inflated);
338 stop("Fast Unlock lock on stack");
339 bind(check_done);
340 #endif
341
342 const Register tmp1_monitor = tmp1;
343
344 if (!UseObjectMonitorTable) {
345 assert(tmp1_monitor == tmp1_mark, "should be the same here");
346 // Untag the monitor.
347 subi(tmp1_monitor, tmp1_mark, (int)markWord::monitor_value);
348 } else {
349 ld(tmp1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
350 // No valid pointer below alignof(ObjectMonitor*). Take the slow path.
351 mv(tmp3_t, alignof(ObjectMonitor*));
352 bltu(tmp1_monitor, tmp3_t, slow_path);
353 }
354
355 const Register tmp2_recursions = tmp2;
356 Label not_recursive;
357
358 // Check if recursive.
359 ld(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
360 beqz(tmp2_recursions, not_recursive);
361
362 // Recursive unlock.
363 subi(tmp2_recursions, tmp2_recursions, 1);
364 sd(tmp2_recursions, Address(tmp1_monitor, ObjectMonitor::recursions_offset()));
365 j(unlocked);
366
367 bind(not_recursive);
368
369 const Register tmp2_owner_addr = tmp2;
370
371 // Compute owner address.
372 la(tmp2_owner_addr, Address(tmp1_monitor, ObjectMonitor::owner_offset()));
373
374 // Set owner to null.
375 // Release to satisfy the JMM
376 membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
377 sd(zr, Address(tmp2_owner_addr));
378 // We need a full fence after clearing owner to avoid stranding.
379 // StoreLoad achieves this.
380 membar(StoreLoad);
381
382 // Check if the entry_list is empty.
383 ld(t0, Address(tmp1_monitor, ObjectMonitor::entry_list_offset()));
384 beqz(t0, unlocked); // If so we are done.
385
386 // Check if there is a successor.
387 ld(tmp3_t, Address(tmp1_monitor, ObjectMonitor::succ_offset()));
388 bnez(tmp3_t, unlocked); // If so we are done.
389
390 // Save the monitor pointer in the current thread, so we can try
391 // to reacquire the lock in SharedRuntime::monitor_exit_helper().
392 sd(tmp1_monitor, Address(xthread, JavaThread::unlocked_inflated_monitor_offset()));
393
394 mv(flag, 1);
395 j(slow_path);
396 }
397
398 bind(unlocked);
399 mv(flag, zr);
400
401 #ifdef ASSERT
402 // Check that unlocked label is reached with flag == 0.
403 Label flag_correct;
404 beqz(flag, flag_correct);
405 stop("Fast Lock Flag != 0");
406 #endif
407
408 bind(slow_path);
409 #ifdef ASSERT
410 // Check that slow_path label is reached with flag != 0.
411 bnez(flag, flag_correct);
412 stop("Fast Lock Flag == 0");
413 bind(flag_correct);
414 #endif
415 // C2 uses the value of flag (0 vs !0) to determine the continuation.
416 }
417
418 // short string
419 // StringUTF16.indexOfChar
420 // StringLatin1.indexOfChar
421 void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
422 Register ch, Register result,
423 bool isL)
424 {
425 Register ch1 = t0;
426 Register index = t1;
427
428 BLOCK_COMMENT("string_indexof_char_short {");
429
430 Label LOOP, LOOP1, LOOP4, LOOP8;
431 Label MATCH, MATCH1, MATCH2, MATCH3,
432 MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;
433
434 mv(result, -1);
435 mv(index, zr);
436
437 bind(LOOP);
438 addi(t0, index, 8);
439 ble(t0, cnt1, LOOP8);
440 addi(t0, index, 4);
441 ble(t0, cnt1, LOOP4);
442 j(LOOP1);
443
444 bind(LOOP8);
445 isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
446 beq(ch, ch1, MATCH);
447 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
448 beq(ch, ch1, MATCH1);
449 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
450 beq(ch, ch1, MATCH2);
451 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
452 beq(ch, ch1, MATCH3);
453 isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
454 beq(ch, ch1, MATCH4);
455 isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
456 beq(ch, ch1, MATCH5);
457 isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
458 beq(ch, ch1, MATCH6);
459 isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
460 beq(ch, ch1, MATCH7);
461 addi(index, index, 8);
462 addi(str1, str1, isL ? 8 : 16);
463 blt(index, cnt1, LOOP);
464 j(NOMATCH);
465
466 bind(LOOP4);
467 isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
468 beq(ch, ch1, MATCH);
469 isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
470 beq(ch, ch1, MATCH1);
471 isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
472 beq(ch, ch1, MATCH2);
473 isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
474 beq(ch, ch1, MATCH3);
475 addi(index, index, 4);
476 addi(str1, str1, isL ? 4 : 8);
477 bge(index, cnt1, NOMATCH);
478
479 bind(LOOP1);
480 isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
481 beq(ch, ch1, MATCH);
482 addi(index, index, 1);
483 addi(str1, str1, isL ? 1 : 2);
484 blt(index, cnt1, LOOP1);
485 j(NOMATCH);
486
487 bind(MATCH1);
488 addi(index, index, 1);
489 j(MATCH);
490
491 bind(MATCH2);
492 addi(index, index, 2);
493 j(MATCH);
494
495 bind(MATCH3);
496 addi(index, index, 3);
497 j(MATCH);
498
499 bind(MATCH4);
500 addi(index, index, 4);
501 j(MATCH);
502
503 bind(MATCH5);
504 addi(index, index, 5);
505 j(MATCH);
506
507 bind(MATCH6);
508 addi(index, index, 6);
509 j(MATCH);
510
511 bind(MATCH7);
512 addi(index, index, 7);
513
514 bind(MATCH);
515 mv(result, index);
516 bind(NOMATCH);
517 BLOCK_COMMENT("} string_indexof_char_short");
518 }
519
520 // StringUTF16.indexOfChar
521 // StringLatin1.indexOfChar
522 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
523 Register ch, Register result,
524 Register tmp1, Register tmp2,
525 Register tmp3, Register tmp4,
526 bool isL)
527 {
528 Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
529 Register ch1 = t0;
530 Register orig_cnt = t1;
531 Register mask1 = tmp3;
532 Register mask2 = tmp2;
533 Register match_mask = tmp1;
534 Register trailing_char = tmp4;
535 Register unaligned_elems = tmp4;
536
537 BLOCK_COMMENT("string_indexof_char {");
538 beqz(cnt1, NOMATCH);
539
540 subi(t0, cnt1, isL ? 32 : 16);
541 bgtz(t0, DO_LONG);
542 string_indexof_char_short(str1, cnt1, ch, result, isL);
543 j(DONE);
544
545 bind(DO_LONG);
546 mv(orig_cnt, cnt1);
547 if (AvoidUnalignedAccesses) {
548 Label ALIGNED;
549 andi(unaligned_elems, str1, 0x7);
550 beqz(unaligned_elems, ALIGNED);
551 sub(unaligned_elems, unaligned_elems, 8);
552 neg(unaligned_elems, unaligned_elems);
553 if (!isL) {
554 srli(unaligned_elems, unaligned_elems, 1);
555 }
556 // do unaligned part per element
557 string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
558 bgez(result, DONE);
559 mv(orig_cnt, cnt1);
560 sub(cnt1, cnt1, unaligned_elems);
561 bind(ALIGNED);
562 }
563
564 // duplicate ch
565 if (isL) {
566 slli(ch1, ch, 8);
567 orr(ch, ch1, ch);
568 }
569 slli(ch1, ch, 16);
570 orr(ch, ch1, ch);
571 slli(ch1, ch, 32);
572 orr(ch, ch1, ch);
573
574 if (!isL) {
575 slli(cnt1, cnt1, 1);
576 }
577
578 uint64_t mask0101 = UCONST64(0x0101010101010101);
579 uint64_t mask0001 = UCONST64(0x0001000100010001);
580 mv(mask1, isL ? mask0101 : mask0001);
581 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
582 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
583 mv(mask2, isL ? mask7f7f : mask7fff);
584
585 bind(CH1_LOOP);
586 ld(ch1, Address(str1));
587 addi(str1, str1, 8);
588 subi(cnt1, cnt1, 8);
589 compute_match_mask(ch1, ch, match_mask, mask1, mask2);
590 bnez(match_mask, HIT);
591 bgtz(cnt1, CH1_LOOP);
592 j(NOMATCH);
593
594 bind(HIT);
595 // count bits of trailing zero chars
596 ctzc_bits(trailing_char, match_mask, isL, ch1, result);
597 srli(trailing_char, trailing_char, 3);
598 addi(cnt1, cnt1, 8);
599 ble(cnt1, trailing_char, NOMATCH);
600 // match case
601 if (!isL) {
602 srli(cnt1, cnt1, 1);
603 srli(trailing_char, trailing_char, 1);
604 }
605
606 sub(result, orig_cnt, cnt1);
607 add(result, result, trailing_char);
608 j(DONE);
609
610 bind(NOMATCH);
611 mv(result, -1);
612
613 bind(DONE);
614 BLOCK_COMMENT("} string_indexof_char");
615 }
616
617 typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);
618
619 // Search for needle in haystack and return index or -1
620 // x10: result
621 // x11: haystack
622 // x12: haystack_len
623 // x13: needle
624 // x14: needle_len
625 void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
626 Register haystack_len, Register needle_len,
627 Register tmp1, Register tmp2,
628 Register tmp3, Register tmp4,
629 Register tmp5, Register tmp6,
630 Register result, int ae)
631 {
632 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
633
634 Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;
635
636 Register ch1 = t0;
637 Register ch2 = t1;
638 Register nlen_tmp = tmp1; // needle len tmp
639 Register hlen_tmp = tmp2; // haystack len tmp
640 Register result_tmp = tmp4;
641
642 bool isLL = ae == StrIntrinsicNode::LL;
643
644 bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
645 bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
646 int needle_chr_shift = needle_isL ? 0 : 1;
647 int haystack_chr_shift = haystack_isL ? 0 : 1;
648 int needle_chr_size = needle_isL ? 1 : 2;
649 int haystack_chr_size = haystack_isL ? 1 : 2;
650 load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
651 (load_chr_insn)&MacroAssembler::lhu;
652 load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
653 (load_chr_insn)&MacroAssembler::lhu;
654
655 BLOCK_COMMENT("string_indexof {");
656
657 // Note, inline_string_indexOf() generates checks:
658 // if (pattern.count > src.count) return -1;
659 // if (pattern.count == 0) return 0;
660
661 // We have two strings, a source string in haystack, haystack_len and a pattern string
662 // in needle, needle_len. Find the first occurrence of pattern in source or return -1.
663
664 // For larger pattern and source we use a simplified Boyer Moore algorithm.
665 // With a small pattern and source we use linear scan.
666
667 // needle_len >=8 && needle_len < 256 && needle_len < haystack_len/4, use bmh algorithm.
668 sub(result_tmp, haystack_len, needle_len);
669 // needle_len < 8, use linear scan
670 sub(t0, needle_len, 8);
671 bltz(t0, LINEARSEARCH);
672 // needle_len >= 256, use linear scan
673 sub(t0, needle_len, 256);
674 bgez(t0, LINEARSTUB);
675 // needle_len >= haystack_len/4, use linear scan
676 srli(t0, haystack_len, 2);
677 bge(needle_len, t0, LINEARSTUB);
678
679 // Boyer-Moore-Horspool introduction:
680 // The Boyer Moore alogorithm is based on the description here:-
681 //
682 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
683 //
684 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
685 // and the 'Good Suffix' rule.
686 //
687 // These rules are essentially heuristics for how far we can shift the
688 // pattern along the search string.
689 //
690 // The implementation here uses the 'Bad Character' rule only because of the
691 // complexity of initialisation for the 'Good Suffix' rule.
692 //
693 // This is also known as the Boyer-Moore-Horspool algorithm:
694 //
695 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
696 //
697 // #define ASIZE 256
698 //
699 // int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
700 // int i, j;
701 // unsigned c;
702 // unsigned char bc[ASIZE];
703 //
704 // /* Preprocessing */
705 // for (i = 0; i < ASIZE; ++i)
706 // bc[i] = m;
707 // for (i = 0; i < m - 1; ) {
708 // c = pattern[i];
709 // ++i;
710 // // c < 256 for Latin1 string, so, no need for branch
711 // #ifdef PATTERN_STRING_IS_LATIN1
712 // bc[c] = m - i;
713 // #else
714 // if (c < ASIZE) bc[c] = m - i;
715 // #endif
716 // }
717 //
718 // /* Searching */
719 // j = 0;
720 // while (j <= n - m) {
721 // c = src[i+j];
722 // if (pattern[m-1] == c)
723 // int k;
724 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
725 // if (k < 0) return j;
726 // // c < 256 for Latin1 string, so, no need for branch
727 // #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
728 // // LL case: (c< 256) always true. Remove branch
729 // j += bc[pattern[j+m-1]];
730 // #endif
731 // #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
732 // // UU case: need if (c<ASIZE) check. Skip 1 character if not.
733 // if (c < ASIZE)
734 // j += bc[pattern[j+m-1]];
735 // else
736 // j += 1
737 // #endif
738 // #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
739 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
740 // if (c < ASIZE)
741 // j += bc[pattern[j+m-1]];
742 // else
743 // j += m
744 // #endif
745 // }
746 // return -1;
747 // }
748
749 // temp register:t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
750 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
751 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
752
753 Register haystack_end = haystack_len;
754 Register skipch = tmp2;
755
756 // pattern length is >=8, so, we can read at least 1 register for cases when
757 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
758 // UL case. We'll re-read last character in inner pre-loop code to have
759 // single outer pre-loop load
760 const int firstStep = isLL ? 7 : 3;
761
762 const int ASIZE = 256;
763 const int STORE_BYTES = 8; // 8 bytes stored per instruction(sd)
764
765 subi(sp, sp, ASIZE);
766
767 // init BC offset table with default value: needle_len
768 slli(t0, needle_len, 8);
769 orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
770 slli(tmp1, t0, 16);
771 orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
772 slli(tmp1, t0, 32);
773 orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
774
775 mv(ch1, sp); // ch1 is t0
776 mv(tmp6, ASIZE / STORE_BYTES); // loop iterations
777
778 bind(BM_INIT_LOOP);
779 // for (i = 0; i < ASIZE; ++i)
780 // bc[i] = m;
781 for (int i = 0; i < 4; i++) {
782 sd(tmp5, Address(ch1, i * wordSize));
783 }
784 addi(ch1, ch1, 32);
785 subi(tmp6, tmp6, 4);
786 bgtz(tmp6, BM_INIT_LOOP);
787
788 subi(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
789 Register orig_haystack = tmp5;
790 mv(orig_haystack, haystack);
791 // result_tmp = tmp4
792 shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
793 subi(ch2, needle_len, 1); // bc offset init value, ch2 is t1
794 mv(tmp3, needle);
795
796 // for (i = 0; i < m - 1; ) {
797 // c = pattern[i];
798 // ++i;
799 // // c < 256 for Latin1 string, so, no need for branch
800 // #ifdef PATTERN_STRING_IS_LATIN1
801 // bc[c] = m - i;
802 // #else
803 // if (c < ASIZE) bc[c] = m - i;
804 // #endif
805 // }
806 bind(BCLOOP);
807 (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
808 addi(tmp3, tmp3, needle_chr_size);
809 if (!needle_isL) {
810 // ae == StrIntrinsicNode::UU
811 mv(tmp6, ASIZE);
812 bgeu(ch1, tmp6, BCSKIP);
813 }
814 add(tmp4, sp, ch1);
815 sb(ch2, Address(tmp4)); // store skip offset to BC offset table
816
817 bind(BCSKIP);
818 subi(ch2, ch2, 1); // for next pattern element, skip distance -1
819 bgtz(ch2, BCLOOP);
820
821 // tmp6: pattern end, address after needle
822 shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
823 if (needle_isL == haystack_isL) {
824 // load last 8 bytes (8LL/4UU symbols)
825 ld(tmp6, Address(tmp6, -wordSize));
826 } else {
827 // UL: from UTF-16(source) search Latin1(pattern)
828 lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes(4 symbols)
829 // convert Latin1 to UTF. eg: 0x0000abcd -> 0x0a0b0c0d
830 // We'll have to wait until load completed, but it's still faster than per-character loads+checks
831 srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], eg:0x0000000a
832 slli(ch2, tmp6, XLEN - 24);
833 srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
834 slli(ch1, tmp6, XLEN - 16);
835 srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
836 zext(tmp6, tmp6, 8); // pattern[m-4], 0x0000000d
837 slli(ch2, ch2, 16);
838 orr(ch2, ch2, ch1); // 0x00000b0c
839 slli(result, tmp3, 48); // use result as temp register
840 orr(tmp6, tmp6, result); // 0x0a00000d
841 slli(result, ch2, 16);
842 orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
843 }
844
845 // i = m - 1;
846 // skipch = j + i;
847 // if (skipch == pattern[m - 1]
848 // for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
849 // else
850 // move j with bad char offset table
851 bind(BMLOOPSTR2);
852 // compare pattern to source string backward
853 shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
854 (this->*haystack_load_1chr)(skipch, Address(result), noreg);
855 subi(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
856 if (needle_isL == haystack_isL) {
857 // re-init tmp3. It's for free because it's executed in parallel with
858 // load above. Alternative is to initialize it before loop, but it'll
859 // affect performance on in-order systems with 2 or more ld/st pipelines
860 srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
861 }
862 if (!isLL) { // UU/UL case
863 slli(ch2, nlen_tmp, 1); // offsets in bytes
864 }
865 bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
866 add(result, haystack, isLL ? nlen_tmp : ch2);
867 // load 8 bytes from source string
868 // if isLL is false then read granularity can be 2
869 load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
870 mv(ch1, tmp6);
871 if (isLL) {
872 j(BMLOOPSTR1_AFTER_LOAD);
873 } else {
874 subi(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
875 j(BMLOOPSTR1_CMP);
876 }
877
878 bind(BMLOOPSTR1);
879 shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
880 (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
881 shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
882 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
883
884 bind(BMLOOPSTR1_AFTER_LOAD);
885 subi(nlen_tmp, nlen_tmp, 1);
886 bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);
887
888 bind(BMLOOPSTR1_CMP);
889 beq(ch1, ch2, BMLOOPSTR1);
890
891 bind(BMSKIP);
892 if (!isLL) {
893 // if we've met UTF symbol while searching Latin1 pattern, then we can
894 // skip needle_len symbols
895 if (needle_isL != haystack_isL) {
896 mv(result_tmp, needle_len);
897 } else {
898 mv(result_tmp, 1);
899 }
900 mv(t0, ASIZE);
901 bgeu(skipch, t0, BMADV);
902 }
903 add(result_tmp, sp, skipch);
904 lbu(result_tmp, Address(result_tmp)); // load skip offset
905
906 bind(BMADV);
907 subi(nlen_tmp, needle_len, 1);
908 // move haystack after bad char skip offset
909 shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
910 ble(haystack, haystack_end, BMLOOPSTR2);
911 addi(sp, sp, ASIZE);
912 j(NOMATCH);
913
914 bind(BMLOOPSTR1_LASTCMP);
915 bne(ch1, ch2, BMSKIP);
916
917 bind(BMMATCH);
918 sub(result, haystack, orig_haystack);
919 if (!haystack_isL) {
920 srli(result, result, 1);
921 }
922 addi(sp, sp, ASIZE);
923 j(DONE);
924
925 bind(LINEARSTUB);
926 subi(t0, needle_len, 16); // small patterns still should be handled by simple algorithm
927 bltz(t0, LINEARSEARCH);
928 mv(result, zr);
929 RuntimeAddress stub = nullptr;
930 if (isLL) {
931 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
932 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
933 } else if (needle_isL) {
934 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
935 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
936 } else {
937 stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
938 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
939 }
940 address call = reloc_call(stub);
941 if (call == nullptr) {
942 DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
943 ciEnv::current()->record_failure("CodeCache is full");
944 return;
945 }
946 j(DONE);
947
948 bind(NOMATCH);
949 mv(result, -1);
950 j(DONE);
951
952 bind(LINEARSEARCH);
953 string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);
954
955 bind(DONE);
956 BLOCK_COMMENT("} string_indexof");
957 }
958
959 // string_indexof
960 // result: x10
961 // src: x11
962 // src_count: x12
963 // pattern: x13
964 // pattern_count: x14 or 1/2/3/4
965 void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
966 Register haystack_len, Register needle_len,
967 Register tmp1, Register tmp2,
968 Register tmp3, Register tmp4,
969 int needle_con_cnt, Register result, int ae)
970 {
971 // Note:
972 // needle_con_cnt > 0 means needle_len register is invalid, needle length is constant
973 // for UU/LL: needle_con_cnt[1, 4], UL: needle_con_cnt = 1
974 assert(needle_con_cnt <= 4, "Invalid needle constant count");
975 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
976
977 Register ch1 = t0;
978 Register ch2 = t1;
979 Register hlen_neg = haystack_len, nlen_neg = needle_len;
980 Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;
981
982 bool isLL = ae == StrIntrinsicNode::LL;
983
984 bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
985 bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
986 int needle_chr_shift = needle_isL ? 0 : 1;
987 int haystack_chr_shift = haystack_isL ? 0 : 1;
988 int needle_chr_size = needle_isL ? 1 : 2;
989 int haystack_chr_size = haystack_isL ? 1 : 2;
990
991 load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
992 (load_chr_insn)&MacroAssembler::lhu;
993 load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
994 (load_chr_insn)&MacroAssembler::lhu;
995 load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
996 load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;
997
998 Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;
999
1000 Register first = tmp3;
1001
1002 if (needle_con_cnt == -1) {
1003 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
1004
1005 subi(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
1006 bltz(t0, DOSHORT);
1007
1008 (this->*needle_load_1chr)(first, Address(needle), noreg);
1009 slli(t0, needle_len, needle_chr_shift);
1010 add(needle, needle, t0);
1011 neg(nlen_neg, t0);
1012 slli(t0, result_tmp, haystack_chr_shift);
1013 add(haystack, haystack, t0);
1014 neg(hlen_neg, t0);
1015
1016 bind(FIRST_LOOP);
1017 add(t0, haystack, hlen_neg);
1018 (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
1019 beq(first, ch2, STR1_LOOP);
1020
1021 bind(STR2_NEXT);
1022 addi(hlen_neg, hlen_neg, haystack_chr_size);
1023 blez(hlen_neg, FIRST_LOOP);
1024 j(NOMATCH);
1025
1026 bind(STR1_LOOP);
1027 addi(nlen_tmp, nlen_neg, needle_chr_size);
1028 addi(hlen_tmp, hlen_neg, haystack_chr_size);
1029 bgez(nlen_tmp, MATCH);
1030
1031 bind(STR1_NEXT);
1032 add(ch1, needle, nlen_tmp);
1033 (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
1034 add(ch2, haystack, hlen_tmp);
1035 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1036 bne(ch1, ch2, STR2_NEXT);
1037 addi(nlen_tmp, nlen_tmp, needle_chr_size);
1038 addi(hlen_tmp, hlen_tmp, haystack_chr_size);
1039 bltz(nlen_tmp, STR1_NEXT);
1040 j(MATCH);
1041
1042 bind(DOSHORT);
1043 if (needle_isL == haystack_isL) {
1044 subi(t0, needle_len, 2);
1045 bltz(t0, DO1);
1046 bgtz(t0, DO3);
1047 }
1048 }
1049
1050 if (needle_con_cnt == 4) {
1051 Label CH1_LOOP;
1052 (this->*load_4chr)(ch1, Address(needle), noreg);
1053 subi(result_tmp, haystack_len, 4);
1054 slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
1055 add(haystack, haystack, tmp3);
1056 neg(hlen_neg, tmp3);
1057 if (AvoidUnalignedAccesses) {
1058 // preload first value, then we will read by 1 character per loop, instead of four
1059 // just shifting previous ch2 right by size of character in bits
1060 add(tmp3, haystack, hlen_neg);
1061 (this->*load_4chr)(ch2, Address(tmp3), noreg);
1062 if (isLL) {
1063 // need to erase 1 most significant byte in 32-bit value of ch2
1064 slli(ch2, ch2, 40);
1065 srli(ch2, ch2, 32);
1066 } else {
1067 slli(ch2, ch2, 16); // 2 most significant bytes will be erased by this operation
1068 }
1069 }
1070
1071 bind(CH1_LOOP);
1072 add(tmp3, haystack, hlen_neg);
1073 if (AvoidUnalignedAccesses) {
1074 srli(ch2, ch2, isLL ? 8 : 16);
1075 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
1076 slli(tmp3, tmp3, isLL ? 24 : 48);
1077 add(ch2, ch2, tmp3);
1078 } else {
1079 (this->*load_4chr)(ch2, Address(tmp3), noreg);
1080 }
1081 beq(ch1, ch2, MATCH);
1082 addi(hlen_neg, hlen_neg, haystack_chr_size);
1083 blez(hlen_neg, CH1_LOOP);
1084 j(NOMATCH);
1085 }
1086
1087 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
1088 Label CH1_LOOP;
1089 BLOCK_COMMENT("string_indexof DO2 {");
1090 bind(DO2);
1091 (this->*load_2chr)(ch1, Address(needle), noreg);
1092 if (needle_con_cnt == 2) {
1093 subi(result_tmp, haystack_len, 2);
1094 }
1095 slli(tmp3, result_tmp, haystack_chr_shift);
1096 add(haystack, haystack, tmp3);
1097 neg(hlen_neg, tmp3);
1098 if (AvoidUnalignedAccesses) {
1099 // preload first value, then we will read by 1 character per loop, instead of two
1100 // just shifting previous ch2 right by size of character in bits
1101 add(tmp3, haystack, hlen_neg);
1102 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1103 slli(ch2, ch2, isLL ? 8 : 16);
1104 }
1105 bind(CH1_LOOP);
1106 add(tmp3, haystack, hlen_neg);
1107 if (AvoidUnalignedAccesses) {
1108 srli(ch2, ch2, isLL ? 8 : 16);
1109 (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
1110 slli(tmp3, tmp3, isLL ? 8 : 16);
1111 add(ch2, ch2, tmp3);
1112 } else {
1113 (this->*load_2chr)(ch2, Address(tmp3), noreg);
1114 }
1115 beq(ch1, ch2, MATCH);
1116 addi(hlen_neg, hlen_neg, haystack_chr_size);
1117 blez(hlen_neg, CH1_LOOP);
1118 j(NOMATCH);
1119 BLOCK_COMMENT("} string_indexof DO2");
1120 }
1121
1122 if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
1123 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1124 BLOCK_COMMENT("string_indexof DO3 {");
1125
1126 bind(DO3);
1127 (this->*load_2chr)(first, Address(needle), noreg);
1128 (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
1129 if (needle_con_cnt == 3) {
1130 subi(result_tmp, haystack_len, 3);
1131 }
1132 slli(hlen_tmp, result_tmp, haystack_chr_shift);
1133 add(haystack, haystack, hlen_tmp);
1134 neg(hlen_neg, hlen_tmp);
1135
1136 bind(FIRST_LOOP);
1137 add(ch2, haystack, hlen_neg);
1138 if (AvoidUnalignedAccesses) {
1139 (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg); // we need a temp register, we can safely use hlen_tmp here, which is a synonym for tmp2
1140 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1141 slli(tmp2, tmp2, isLL ? 8 : 16);
1142 add(ch2, ch2, tmp2);
1143 } else {
1144 (this->*load_2chr)(ch2, Address(ch2), noreg);
1145 }
1146 beq(first, ch2, STR1_LOOP);
1147
1148 bind(STR2_NEXT);
1149 addi(hlen_neg, hlen_neg, haystack_chr_size);
1150 blez(hlen_neg, FIRST_LOOP);
1151 j(NOMATCH);
1152
1153 bind(STR1_LOOP);
1154 addi(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
1155 add(ch2, haystack, hlen_tmp);
1156 (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
1157 bne(ch1, ch2, STR2_NEXT);
1158 j(MATCH);
1159 BLOCK_COMMENT("} string_indexof DO3");
1160 }
1161
1162 if (needle_con_cnt == -1 || needle_con_cnt == 1) {
1163 Label DO1_LOOP;
1164
1165 BLOCK_COMMENT("string_indexof DO1 {");
1166 bind(DO1);
1167 (this->*needle_load_1chr)(ch1, Address(needle), noreg);
1168 subi(result_tmp, haystack_len, 1);
1169 slli(tmp3, result_tmp, haystack_chr_shift);
1170 add(haystack, haystack, tmp3);
1171 neg(hlen_neg, tmp3);
1172
1173 bind(DO1_LOOP);
1174 add(tmp3, haystack, hlen_neg);
1175 (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
1176 beq(ch1, ch2, MATCH);
1177 addi(hlen_neg, hlen_neg, haystack_chr_size);
1178 blez(hlen_neg, DO1_LOOP);
1179 BLOCK_COMMENT("} string_indexof DO1");
1180 }
1181
1182 bind(NOMATCH);
1183 mv(result, -1);
1184 j(DONE);
1185
1186 bind(MATCH);
1187 srai(t0, hlen_neg, haystack_chr_shift);
1188 add(result, result_tmp, t0);
1189
1190 bind(DONE);
1191 }
1192
1193 // Compare longwords
1194 void C2_MacroAssembler::string_compare_long_same_encoding(Register result, Register str1, Register str2,
1195 const bool isLL, Register cnt1, Register cnt2,
1196 Register tmp1, Register tmp2, Register tmp3,
1197 const int STUB_THRESHOLD, Label *STUB, Label *SHORT_STRING, Label *DONE) {
1198 Label TAIL_CHECK, TAIL, NEXT_WORD, DIFFERENCE;
1199
1200 const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1201 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1202
1203 const int minCharsInWord = isLL ? wordSize : wordSize / 2;
1204
1205 // load first parts of strings and finish initialization while loading
1206 beq(str1, str2, *DONE);
1207 // Alignment
1208 if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1209 lwu(tmp1, Address(str1));
1210 lwu(tmp2, Address(str2));
1211 bne(tmp1, tmp2, DIFFERENCE);
1212 addi(str1, str1, 4);
1213 addi(str2, str2, 4);
1214 subi(cnt2, cnt2, minCharsInWord / 2);
1215
1216 // A very short string
1217 mv(t0, minCharsInWord);
1218 ble(cnt2, t0, *SHORT_STRING);
1219 }
1220 #ifdef ASSERT
1221 if (AvoidUnalignedAccesses) {
1222 Label align_ok;
1223 orr(t0, str1, str2);
1224 andi(t0, t0, 0x7);
1225 beqz(t0, align_ok);
1226 stop("bad alignment");
1227 bind(align_ok);
1228 }
1229 #endif
1230 // load 8 bytes once to compare
1231 ld(tmp1, Address(str1));
1232 ld(tmp2, Address(str2));
1233 mv(t0, STUB_THRESHOLD);
1234 bge(cnt2, t0, *STUB);
1235 subi(cnt2, cnt2, minCharsInWord);
1236 beqz(cnt2, TAIL_CHECK);
1237 // convert cnt2 from characters to bytes
1238 if (!isLL) {
1239 slli(cnt2, cnt2, 1);
1240 }
1241 add(str2, str2, cnt2);
1242 add(str1, str1, cnt2);
1243 sub(cnt2, zr, cnt2);
1244 addi(cnt2, cnt2, 8);
1245 bne(tmp1, tmp2, DIFFERENCE);
1246 bgez(cnt2, TAIL);
1247
1248 // main loop
1249 bind(NEXT_WORD);
1250 // 8-byte aligned loads when AvoidUnalignedAccesses is enabled
1251 add(t0, str1, cnt2);
1252 ld(tmp1, Address(t0));
1253 add(t0, str2, cnt2);
1254 ld(tmp2, Address(t0));
1255 addi(cnt2, cnt2, 8);
1256 bne(tmp1, tmp2, DIFFERENCE);
1257 bltz(cnt2, NEXT_WORD);
1258
1259 bind(TAIL);
1260 load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
1261 load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
1262
1263 bind(TAIL_CHECK);
1264 beq(tmp1, tmp2, *DONE);
1265
1266 // Find the first different characters in the longwords and
1267 // compute their difference.
1268 bind(DIFFERENCE);
1269 xorr(tmp3, tmp1, tmp2);
1270 // count bits of trailing zero chars
1271 ctzc_bits(result, tmp3, isLL);
1272 srl(tmp1, tmp1, result);
1273 srl(tmp2, tmp2, result);
1274 if (isLL) {
1275 zext(tmp1, tmp1, 8);
1276 zext(tmp2, tmp2, 8);
1277 } else {
1278 zext(tmp1, tmp1, 16);
1279 zext(tmp2, tmp2, 16);
1280 }
1281 sub(result, tmp1, tmp2);
1282
1283 j(*DONE);
1284 }
1285
1286 // Compare longwords
1287 void C2_MacroAssembler::string_compare_long_different_encoding(Register result, Register str1, Register str2,
1288 bool isLU, Register cnt1, Register cnt2,
1289 Register tmp1, Register tmp2, Register tmp3,
1290 const int STUB_THRESHOLD, Label *STUB, Label *DONE) {
1291 Label TAIL, NEXT_WORD, DIFFERENCE;
1292
1293 const int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1294 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1295
1296 Register strL = isLU ? str1 : str2;
1297 Register strU = isLU ? str2 : str1;
1298 Register tmpL = tmp1, tmpU = tmp2;
1299
1300 // load first parts of strings and finish initialization while loading
1301 mv(t0, STUB_THRESHOLD);
1302 bge(cnt2, t0, *STUB);
1303 lwu(tmpL, Address(strL));
1304 load_long_misaligned(tmpU, Address(strU), tmp3, (base_offset % 8) != 0 ? 4 : 8);
1305 subi(cnt2, cnt2, 4);
1306 add(strL, strL, cnt2);
1307 sub(cnt1, zr, cnt2);
1308 slli(cnt2, cnt2, 1);
1309 add(strU, strU, cnt2);
1310 inflate_lo32(tmp3, tmpL);
1311 mv(tmpL, tmp3);
1312 sub(cnt2, zr, cnt2);
1313 addi(cnt1, cnt1, 4);
1314 addi(cnt2, cnt2, 8);
1315 bne(tmpL, tmpU, DIFFERENCE);
1316 bgez(cnt2, TAIL);
1317
1318 // main loop
1319 bind(NEXT_WORD);
1320 add(t0, strL, cnt1);
1321 lwu(tmpL, Address(t0));
1322 add(t0, strU, cnt2);
1323 load_long_misaligned(tmpU, Address(t0), tmp3, (base_offset % 8) != 0 ? 4 : 8);
1324 addi(cnt1, cnt1, 4);
1325 inflate_lo32(tmp3, tmpL);
1326 mv(tmpL, tmp3);
1327 addi(cnt2, cnt2, 8);
1328 bne(tmpL, tmpU, DIFFERENCE);
1329 bltz(cnt2, NEXT_WORD);
1330
1331 bind(TAIL);
1332 load_int_misaligned(tmpL, Address(strL), tmp3, false);
1333 load_long_misaligned(tmpU, Address(strU), tmp3, 2);
1334 inflate_lo32(tmp3, tmpL);
1335 mv(tmpL, tmp3);
1336
1337 beq(tmpL, tmpU, *DONE);
1338
1339 // Find the first different characters in the longwords and
1340 // compute their difference.
1341 bind(DIFFERENCE);
1342 xorr(tmp3, tmpL, tmpU);
1343 // count bits of trailing zero chars
1344 ctzc_bits(result, tmp3);
1345 srl(tmpL, tmpL, result);
1346 srl(tmpU, tmpU, result);
1347 zext(tmpL, tmpL, 16);
1348 zext(tmpU, tmpU, 16);
1349 if (isLU) {
1350 sub(result, tmpL, tmpU);
1351 } else {
1352 sub(result, tmpU, tmpL);
1353 }
1354
1355 j(*DONE);
1356 }
1357
1358 // Compare strings.
1359 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1360 Register cnt1, Register cnt2, Register result,
1361 Register tmp1, Register tmp2, Register tmp3,
1362 int ae)
1363 {
1364 Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, STUB,
1365 SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1366 SHORT_LOOP_START, L;
1367
1368 const int STUB_THRESHOLD = 64 + 8;
1369 bool isLL = ae == StrIntrinsicNode::LL;
1370 bool isLU = ae == StrIntrinsicNode::LU;
1371 bool isUL = ae == StrIntrinsicNode::UL;
1372
1373 bool str1_isL = isLL || isLU;
1374 bool str2_isL = isLL || isUL;
1375
1376 // for L strings, 1 byte for 1 character
1377 // for U strings, 2 bytes for 1 character
1378 int str1_chr_size = str1_isL ? 1 : 2;
1379 int str2_chr_size = str2_isL ? 1 : 2;
1380 int minCharsInWord = isLL ? wordSize : wordSize / 2;
1381
1382 load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1383 load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
1384
1385 BLOCK_COMMENT("string_compare {");
1386
1387 // Bizarrely, the counts are passed in bytes, regardless of whether they
1388 // are L or U strings, however the result is always in characters.
1389 if (!str1_isL) {
1390 sraiw(cnt1, cnt1, 1);
1391 }
1392 if (!str2_isL) {
1393 sraiw(cnt2, cnt2, 1);
1394 }
1395
1396 // Compute the minimum of the string lengths and save the difference in result.
1397 sub(result, cnt1, cnt2);
1398 bgt(cnt1, cnt2, L);
1399 mv(cnt2, cnt1);
1400 bind(L);
1401
1402 // A very short string
1403 mv(t0, minCharsInWord);
1404 ble(cnt2, t0, SHORT_STRING);
1405
1406 // Compare longwords
1407 {
1408 if (str1_isL == str2_isL) { // LL or UU
1409 string_compare_long_same_encoding(result,
1410 str1, str2, isLL,
1411 cnt1, cnt2, tmp1, tmp2, tmp3,
1412 STUB_THRESHOLD, &STUB, &SHORT_STRING, &DONE);
1413 } else { // LU or UL
1414 string_compare_long_different_encoding(result,
1415 str1, str2, isLU,
1416 cnt1, cnt2, tmp1, tmp2, tmp3,
1417 STUB_THRESHOLD, &STUB, &DONE);
1418 }
1419 }
1420
1421 bind(STUB);
1422 RuntimeAddress stub = nullptr;
1423 switch (ae) {
1424 case StrIntrinsicNode::LL:
1425 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
1426 break;
1427 case StrIntrinsicNode::UU:
1428 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
1429 break;
1430 case StrIntrinsicNode::LU:
1431 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
1432 break;
1433 case StrIntrinsicNode::UL:
1434 stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
1435 break;
1436 default:
1437 ShouldNotReachHere();
1438 }
1439 assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1440 address call = reloc_call(stub);
1441 if (call == nullptr) {
1442 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1443 ciEnv::current()->record_failure("CodeCache is full");
1444 return;
1445 }
1446 j(DONE);
1447
1448 bind(SHORT_STRING);
1449 // Is the minimum length zero?
1450 beqz(cnt2, DONE);
1451 // arrange code to do most branches while loading and loading next characters
1452 // while comparing previous
1453 (this->*str1_load_chr)(tmp1, Address(str1), t0);
1454 addi(str1, str1, str1_chr_size);
1455 subi(cnt2, cnt2, 1);
1456 beqz(cnt2, SHORT_LAST_INIT);
1457 (this->*str2_load_chr)(cnt1, Address(str2), t0);
1458 addi(str2, str2, str2_chr_size);
1459 j(SHORT_LOOP_START);
1460 bind(SHORT_LOOP);
1461 subi(cnt2, cnt2, 1);
1462 beqz(cnt2, SHORT_LAST);
1463 bind(SHORT_LOOP_START);
1464 (this->*str1_load_chr)(tmp2, Address(str1), t0);
1465 addi(str1, str1, str1_chr_size);
1466 (this->*str2_load_chr)(t0, Address(str2), t0);
1467 addi(str2, str2, str2_chr_size);
1468 bne(tmp1, cnt1, SHORT_LOOP_TAIL);
1469 subi(cnt2, cnt2, 1);
1470 beqz(cnt2, SHORT_LAST2);
1471 (this->*str1_load_chr)(tmp1, Address(str1), t0);
1472 addi(str1, str1, str1_chr_size);
1473 (this->*str2_load_chr)(cnt1, Address(str2), t0);
1474 addi(str2, str2, str2_chr_size);
1475 beq(tmp2, t0, SHORT_LOOP);
1476 sub(result, tmp2, t0);
1477 j(DONE);
1478 bind(SHORT_LOOP_TAIL);
1479 sub(result, tmp1, cnt1);
1480 j(DONE);
1481 bind(SHORT_LAST2);
1482 beq(tmp2, t0, DONE);
1483 sub(result, tmp2, t0);
1484
1485 j(DONE);
1486 bind(SHORT_LAST_INIT);
1487 (this->*str2_load_chr)(cnt1, Address(str2), t0);
1488 addi(str2, str2, str2_chr_size);
1489 bind(SHORT_LAST);
1490 beq(tmp1, cnt1, DONE);
1491 sub(result, tmp1, cnt1);
1492
1493 bind(DONE);
1494
1495 BLOCK_COMMENT("} string_compare");
1496 }
1497
1498 void C2_MacroAssembler::arrays_equals(Register a1, Register a2,
1499 Register tmp1, Register tmp2, Register tmp3,
1500 Register result, int elem_size) {
1501 assert(elem_size == 1 || elem_size == 2, "must be char or byte");
1502 assert_different_registers(a1, a2, result, tmp1, tmp2, tmp3, t0);
1503
1504 int elem_per_word = wordSize / elem_size;
1505 int log_elem_size = exact_log2(elem_size);
1506 int length_offset = arrayOopDesc::length_offset_in_bytes();
1507 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
1508
1509 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1510
1511 Register cnt1 = tmp3;
1512 Register cnt2 = tmp1; // cnt2 only used in array length compare
1513 Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01;
1514
1515 BLOCK_COMMENT("arrays_equals {");
1516
1517 // if (a1 == a2), return true
1518 beq(a1, a2, SAME);
1519
1520 mv(result, false);
1521 // if (a1 == nullptr || a2 == nullptr)
1522 // return false;
1523 beqz(a1, DONE);
1524 beqz(a2, DONE);
1525
1526 // if (a1.length != a2.length)
1527 // return false;
1528 lwu(cnt1, Address(a1, length_offset));
1529 lwu(cnt2, Address(a2, length_offset));
1530 bne(cnt1, cnt2, DONE);
1531
1532 la(a1, Address(a1, base_offset));
1533 la(a2, Address(a2, base_offset));
1534
1535 // Load 4 bytes once to compare for alignment before main loop.
1536 if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1537 subi(cnt1, cnt1, elem_per_word / 2);
1538 bltz(cnt1, TAIL03);
1539 lwu(tmp1, Address(a1));
1540 lwu(tmp2, Address(a2));
1541 addi(a1, a1, 4);
1542 addi(a2, a2, 4);
1543 bne(tmp1, tmp2, DONE);
1544 }
1545
1546 // Check for short strings, i.e. smaller than wordSize.
1547 subi(cnt1, cnt1, elem_per_word);
1548 bltz(cnt1, SHORT);
1549
1550 #ifdef ASSERT
1551 if (AvoidUnalignedAccesses) {
1552 Label align_ok;
1553 orr(t0, a1, a2);
1554 andi(t0, t0, 0x7);
1555 beqz(t0, align_ok);
1556 stop("bad alignment");
1557 bind(align_ok);
1558 }
1559 #endif
1560
1561 // Main 8 byte comparison loop.
1562 bind(NEXT_WORD); {
1563 ld(tmp1, Address(a1));
1564 ld(tmp2, Address(a2));
1565 subi(cnt1, cnt1, elem_per_word);
1566 addi(a1, a1, wordSize);
1567 addi(a2, a2, wordSize);
1568 bne(tmp1, tmp2, DONE);
1569 } bgez(cnt1, NEXT_WORD);
1570
1571 addi(tmp1, cnt1, elem_per_word);
1572 beqz(tmp1, SAME);
1573
1574 bind(SHORT);
1575 test_bit(tmp1, cnt1, 2 - log_elem_size);
1576 beqz(tmp1, TAIL03); // 0-7 bytes left.
1577 {
1578 lwu(tmp1, Address(a1));
1579 lwu(tmp2, Address(a2));
1580 addi(a1, a1, 4);
1581 addi(a2, a2, 4);
1582 bne(tmp1, tmp2, DONE);
1583 }
1584
1585 bind(TAIL03);
1586 test_bit(tmp1, cnt1, 1 - log_elem_size);
1587 beqz(tmp1, TAIL01); // 0-3 bytes left.
1588 {
1589 lhu(tmp1, Address(a1));
1590 lhu(tmp2, Address(a2));
1591 addi(a1, a1, 2);
1592 addi(a2, a2, 2);
1593 bne(tmp1, tmp2, DONE);
1594 }
1595
1596 bind(TAIL01);
1597 if (elem_size == 1) { // Only needed when comparing byte arrays.
1598 test_bit(tmp1, cnt1, 0);
1599 beqz(tmp1, SAME); // 0-1 bytes left.
1600 {
1601 lbu(tmp1, Address(a1));
1602 lbu(tmp2, Address(a2));
1603 bne(tmp1, tmp2, DONE);
1604 }
1605 }
1606
1607 bind(SAME);
1608 mv(result, true);
1609 // That's it.
1610 bind(DONE);
1611
1612 BLOCK_COMMENT("} arrays_equals");
1613 }
1614
1615 // Compare Strings
1616
1617 // For Strings we're passed the address of the first characters in a1 and a2
1618 // and the length in cnt1. There are two implementations.
1619 // For arrays >= 8 bytes, all comparisons (except for the tail) are performed
1620 // 8 bytes at a time. For the tail, we compare a halfword, then a short, and then a byte.
1621 // For strings < 8 bytes, we compare a halfword, then a short, and then a byte.
1622
1623 void C2_MacroAssembler::string_equals(Register a1, Register a2,
1624 Register result, Register cnt1)
1625 {
1626 Label SAME, DONE, SHORT, NEXT_WORD, TAIL03, TAIL01;
1627 Register tmp1 = t0;
1628 Register tmp2 = t1;
1629
1630 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);
1631
1632 int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
1633
1634 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
1635
1636 BLOCK_COMMENT("string_equals {");
1637
1638 mv(result, false);
1639
1640 // Load 4 bytes once to compare for alignment before main loop.
1641 if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
1642 subi(cnt1, cnt1, 4);
1643 bltz(cnt1, TAIL03);
1644 lwu(tmp1, Address(a1));
1645 lwu(tmp2, Address(a2));
1646 addi(a1, a1, 4);
1647 addi(a2, a2, 4);
1648 bne(tmp1, tmp2, DONE);
1649 }
1650
1651 // Check for short strings, i.e. smaller than wordSize.
1652 subi(cnt1, cnt1, wordSize);
1653 bltz(cnt1, SHORT);
1654
1655 #ifdef ASSERT
1656 if (AvoidUnalignedAccesses) {
1657 Label align_ok;
1658 orr(t0, a1, a2);
1659 andi(t0, t0, 0x7);
1660 beqz(t0, align_ok);
1661 stop("bad alignment");
1662 bind(align_ok);
1663 }
1664 #endif
1665
1666 // Main 8 byte comparison loop.
1667 bind(NEXT_WORD); {
1668 ld(tmp1, Address(a1));
1669 ld(tmp2, Address(a2));
1670 subi(cnt1, cnt1, wordSize);
1671 addi(a1, a1, wordSize);
1672 addi(a2, a2, wordSize);
1673 bne(tmp1, tmp2, DONE);
1674 } bgez(cnt1, NEXT_WORD);
1675
1676 addi(tmp1, cnt1, wordSize);
1677 beqz(tmp1, SAME);
1678
1679 bind(SHORT);
1680 // 0-7 bytes left.
1681 test_bit(tmp1, cnt1, 2);
1682 beqz(tmp1, TAIL03);
1683 {
1684 lwu(tmp1, Address(a1));
1685 lwu(tmp2, Address(a2));
1686 addi(a1, a1, 4);
1687 addi(a2, a2, 4);
1688 bne(tmp1, tmp2, DONE);
1689 }
1690
1691 bind(TAIL03);
1692 // 0-3 bytes left.
1693 test_bit(tmp1, cnt1, 1);
1694 beqz(tmp1, TAIL01);
1695 {
1696 lhu(tmp1, Address(a1));
1697 lhu(tmp2, Address(a2));
1698 addi(a1, a1, 2);
1699 addi(a2, a2, 2);
1700 bne(tmp1, tmp2, DONE);
1701 }
1702
1703 bind(TAIL01);
1704 // 0-1 bytes left.
1705 test_bit(tmp1, cnt1, 0);
1706 beqz(tmp1, SAME);
1707 {
1708 lbu(tmp1, Address(a1));
1709 lbu(tmp2, Address(a2));
1710 bne(tmp1, tmp2, DONE);
1711 }
1712
1713 // Arrays are equal.
1714 bind(SAME);
1715 mv(result, true);
1716
1717 // That's it.
1718 bind(DONE);
1719 BLOCK_COMMENT("} string_equals");
1720 }
1721
1722 // jdk.internal.util.ArraysSupport.vectorizedHashCode
1723 void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
1724 Register tmp1, Register tmp2, Register tmp3,
1725 Register tmp4, Register tmp5, Register tmp6,
1726 BasicType eltype)
1727 {
1728 assert(!UseRVV, "sanity");
1729 assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, t0, t1);
1730
1731 const int elsize = arrays_hashcode_elsize(eltype);
1732 const int chunks_end_shift = exact_log2(elsize);
1733
1734 switch (eltype) {
1735 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
1736 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
1737 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
1738 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
1739 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
1740 default:
1741 ShouldNotReachHere();
1742 }
1743
1744 const int stride = 4;
1745 const Register pow31_4 = tmp1;
1746 const Register pow31_3 = tmp2;
1747 const Register pow31_2 = tmp3;
1748 const Register chunks = tmp4;
1749 const Register chunks_end = chunks;
1750
1751 Label DONE, TAIL, TAIL_LOOP, WIDE_LOOP;
1752
1753 // result has a value initially
1754
1755 beqz(cnt, DONE);
1756
1757 andi(chunks, cnt, ~(stride - 1));
1758 beqz(chunks, TAIL);
1759
1760 mv(pow31_4, 923521); // [31^^4]
1761 mv(pow31_3, 29791); // [31^^3]
1762 mv(pow31_2, 961); // [31^^2]
1763
1764 shadd(chunks_end, chunks, ary, t0, chunks_end_shift);
1765 andi(cnt, cnt, stride - 1); // don't forget about tail!
1766
1767 bind(WIDE_LOOP);
1768 arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype);
1769 arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype);
1770 arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
1771 arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
1772 mulw(result, result, pow31_4); // 31^^4 * h
1773 mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0]
1774 addw(result, result, t0);
1775 mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1]
1776 addw(result, result, t1);
1777 slli(t0, tmp5, 5); // optimize 31^^1 * ary[i+2]
1778 subw(tmp5, t0, tmp5); // with ary[i+2]<<5 - ary[i+2]
1779 addw(result, result, tmp5);
1780 addw(result, result, tmp6); // 31^^4 * h + 31^^3 * ary[i+0] + 31^^2 * ary[i+1]
1781 // + 31^^1 * ary[i+2] + 31^^0 * ary[i+3]
1782 addi(ary, ary, elsize * stride);
1783 bne(ary, chunks_end, WIDE_LOOP);
1784 beqz(cnt, DONE);
1785
1786 bind(TAIL);
1787 shadd(chunks_end, cnt, ary, t0, chunks_end_shift);
1788
1789 bind(TAIL_LOOP);
1790 arrays_hashcode_elload(t0, Address(ary), eltype);
1791 slli(t1, result, 5); // optimize 31 * result
1792 subw(result, t1, result); // with result<<5 - result
1793 addw(result, result, t0);
1794 addi(ary, ary, elsize);
1795 bne(ary, chunks_end, TAIL_LOOP);
1796
1797 bind(DONE);
1798 BLOCK_COMMENT("} // arrays_hashcode");
1799 }
1800
1801 void C2_MacroAssembler::arrays_hashcode_v(Register ary, Register cnt, Register result,
1802 Register tmp1, Register tmp2, Register tmp3,
1803 BasicType eltype)
1804 {
1805 assert(UseRVV, "sanity");
1806 assert(StubRoutines::riscv::arrays_hashcode_powers_of_31() != nullptr, "sanity");
1807 assert_different_registers(ary, cnt, result, tmp1, tmp2, tmp3, t0, t1);
1808
1809 // The MaxVectorSize should have been set by detecting RVV max vector register
1810 // size when check UseRVV (i.e. MaxVectorSize == VM_Version::_initial_vector_length).
1811 // Let's use T_INT as all hashCode calculations eventually deal with ints.
1812 const int lmul = 2;
1813 const int stride = MaxVectorSize / sizeof(jint) * lmul;
1814
1815 const int elsize_bytes = arrays_hashcode_elsize(eltype);
1816 const int elsize_shift = exact_log2(elsize_bytes);
1817
1818 switch (eltype) {
1819 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode_v(unsigned byte) {"); break;
1820 case T_CHAR: BLOCK_COMMENT("arrays_hashcode_v(char) {"); break;
1821 case T_BYTE: BLOCK_COMMENT("arrays_hashcode_v(byte) {"); break;
1822 case T_SHORT: BLOCK_COMMENT("arrays_hashcode_v(short) {"); break;
1823 case T_INT: BLOCK_COMMENT("arrays_hashcode_v(int) {"); break;
1824 default:
1825 ShouldNotReachHere();
1826 }
1827
1828 const Register pow31_highest = tmp1;
1829 const Register ary_end = tmp2;
1830 const Register consumed = tmp3;
1831
1832 const VectorRegister v_sum = v2;
1833 const VectorRegister v_src = v4;
1834 const VectorRegister v_coeffs = v6;
1835 const VectorRegister v_tmp = v8;
1836
1837 const address adr_pows31 = StubRoutines::riscv::arrays_hashcode_powers_of_31()
1838 + sizeof(jint);
1839 Label VEC_LOOP, DONE, SCALAR_TAIL, SCALAR_TAIL_LOOP;
1840
1841 // NB: at this point (a) 'result' already has some value,
1842 // (b) 'cnt' is not 0 or 1, see java code for details.
1843
1844 andi(t0, cnt, ~(stride - 1));
1845 beqz(t0, SCALAR_TAIL);
1846
1847 la(t1, ExternalAddress(adr_pows31));
1848 lw(pow31_highest, Address(t1, -1 * sizeof(jint)));
1849
1850 vsetvli(consumed, cnt, Assembler::e32, Assembler::m2);
1851 vle32_v(v_coeffs, t1); // 31^^(stride - 1) ... 31^^0
1852 vmv_v_x(v_sum, x0);
1853
1854 bind(VEC_LOOP);
1855 arrays_hashcode_elload_v(v_src, v_tmp, ary, eltype);
1856 vmul_vv(v_src, v_src, v_coeffs);
1857 vmadd_vx(v_sum, pow31_highest, v_src);
1858 mulw(result, result, pow31_highest);
1859 shadd(ary, consumed, ary, t0, elsize_shift);
1860 subw(cnt, cnt, consumed);
1861 andi(t1, cnt, ~(stride - 1));
1862 bnez(t1, VEC_LOOP);
1863
1864 vmv_s_x(v_tmp, x0);
1865 vredsum_vs(v_sum, v_sum, v_tmp);
1866 vmv_x_s(t0, v_sum);
1867 addw(result, result, t0);
1868 beqz(cnt, DONE);
1869
1870 bind(SCALAR_TAIL);
1871 shadd(ary_end, cnt, ary, t0, elsize_shift);
1872
1873 bind(SCALAR_TAIL_LOOP);
1874 arrays_hashcode_elload(t0, Address(ary), eltype);
1875 slli(t1, result, 5); // optimize 31 * result
1876 subw(result, t1, result); // with result<<5 - result
1877 addw(result, result, t0);
1878 addi(ary, ary, elsize_bytes);
1879 bne(ary, ary_end, SCALAR_TAIL_LOOP);
1880
1881 bind(DONE);
1882 BLOCK_COMMENT("} // arrays_hashcode_v");
1883 }
1884
1885 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
1886 switch (eltype) {
1887 case T_BOOLEAN: return sizeof(jboolean);
1888 case T_BYTE: return sizeof(jbyte);
1889 case T_SHORT: return sizeof(jshort);
1890 case T_CHAR: return sizeof(jchar);
1891 case T_INT: return sizeof(jint);
1892 default:
1893 ShouldNotReachHere();
1894 return -1;
1895 }
1896 }
1897
1898 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
1899 switch (eltype) {
1900 // T_BOOLEAN used as surrogate for unsigned byte
1901 case T_BOOLEAN: lbu(dst, src); break;
1902 case T_BYTE: lb(dst, src); break;
1903 case T_SHORT: lh(dst, src); break;
1904 case T_CHAR: lhu(dst, src); break;
1905 case T_INT: lw(dst, src); break;
1906 default:
1907 ShouldNotReachHere();
1908 }
1909 }
1910
1911 void C2_MacroAssembler::arrays_hashcode_elload_v(VectorRegister vdst,
1912 VectorRegister vtmp,
1913 Register src,
1914 BasicType eltype) {
1915 assert_different_registers(vdst, vtmp);
1916 switch (eltype) {
1917 case T_BOOLEAN:
1918 vle8_v(vtmp, src);
1919 vzext_vf4(vdst, vtmp);
1920 break;
1921 case T_BYTE:
1922 vle8_v(vtmp, src);
1923 vsext_vf4(vdst, vtmp);
1924 break;
1925 case T_CHAR:
1926 vle16_v(vtmp, src);
1927 vzext_vf2(vdst, vtmp);
1928 break;
1929 case T_SHORT:
1930 vle16_v(vtmp, src);
1931 vsext_vf2(vdst, vtmp);
1932 break;
1933 case T_INT:
1934 vle32_v(vdst, src);
1935 break;
1936 default:
1937 ShouldNotReachHere();
1938 }
1939 }
1940
1941 typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
1942 typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
1943 bool is_far, bool is_unordered);
1944
1945 static conditional_branch_insn conditional_branches[] =
1946 {
1947 /* SHORT branches */
1948 (conditional_branch_insn)&MacroAssembler::beq,
1949 (conditional_branch_insn)&MacroAssembler::bgt,
1950 nullptr, // BoolTest::overflow
1951 (conditional_branch_insn)&MacroAssembler::blt,
1952 (conditional_branch_insn)&MacroAssembler::bne,
1953 (conditional_branch_insn)&MacroAssembler::ble,
1954 nullptr, // BoolTest::no_overflow
1955 (conditional_branch_insn)&MacroAssembler::bge,
1956
1957 /* UNSIGNED branches */
1958 (conditional_branch_insn)&MacroAssembler::beq,
1959 (conditional_branch_insn)&MacroAssembler::bgtu,
1960 nullptr,
1961 (conditional_branch_insn)&MacroAssembler::bltu,
1962 (conditional_branch_insn)&MacroAssembler::bne,
1963 (conditional_branch_insn)&MacroAssembler::bleu,
1964 nullptr,
1965 (conditional_branch_insn)&MacroAssembler::bgeu
1966 };
1967
1968 static float_conditional_branch_insn float_conditional_branches[] =
1969 {
1970 /* FLOAT SHORT branches */
1971 (float_conditional_branch_insn)&MacroAssembler::float_beq,
1972 (float_conditional_branch_insn)&MacroAssembler::float_bgt,
1973 nullptr, // BoolTest::overflow
1974 (float_conditional_branch_insn)&MacroAssembler::float_blt,
1975 (float_conditional_branch_insn)&MacroAssembler::float_bne,
1976 (float_conditional_branch_insn)&MacroAssembler::float_ble,
1977 nullptr, // BoolTest::no_overflow
1978 (float_conditional_branch_insn)&MacroAssembler::float_bge,
1979
1980 /* DOUBLE SHORT branches */
1981 (float_conditional_branch_insn)&MacroAssembler::double_beq,
1982 (float_conditional_branch_insn)&MacroAssembler::double_bgt,
1983 nullptr,
1984 (float_conditional_branch_insn)&MacroAssembler::double_blt,
1985 (float_conditional_branch_insn)&MacroAssembler::double_bne,
1986 (float_conditional_branch_insn)&MacroAssembler::double_ble,
1987 nullptr,
1988 (float_conditional_branch_insn)&MacroAssembler::double_bge
1989 };
1990
1991 void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
1992 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
1993 "invalid conditional branch index");
1994 (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
1995 }
1996
1997 // This is a function should only be used by C2. Flip the unordered when unordered-greater, C2 would use
1998 // unordered-lesser instead of unordered-greater. Finally, commute the result bits at function do_one_bytecode().
1999 void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
2000 assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
2001 "invalid float conditional branch index");
2002 int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
2003 (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
2004 (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
2005 }
2006
2007 void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
2008 switch (cmpFlag) {
2009 case BoolTest::eq:
2010 case BoolTest::le:
2011 beqz(op1, L, is_far);
2012 break;
2013 case BoolTest::ne:
2014 case BoolTest::gt:
2015 bnez(op1, L, is_far);
2016 break;
2017 default:
2018 ShouldNotReachHere();
2019 }
2020 }
2021
2022 void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
2023 switch (cmpFlag) {
2024 case BoolTest::eq:
2025 beqz(op1, L, is_far);
2026 break;
2027 case BoolTest::ne:
2028 bnez(op1, L, is_far);
2029 break;
2030 default:
2031 ShouldNotReachHere();
2032 }
2033 }
2034
2035 void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
2036 bool is_unsigned = (cmpFlag & unsigned_branch_mask) == unsigned_branch_mask;
2037 int op_select = cmpFlag & (~unsigned_branch_mask);
2038
2039 switch (op_select) {
2040 case BoolTest::eq:
2041 cmov_eq(op1, op2, dst, src);
2042 break;
2043 case BoolTest::ne:
2044 cmov_ne(op1, op2, dst, src);
2045 break;
2046 case BoolTest::le:
2047 if (is_unsigned) {
2048 cmov_leu(op1, op2, dst, src);
2049 } else {
2050 cmov_le(op1, op2, dst, src);
2051 }
2052 break;
2053 case BoolTest::ge:
2054 if (is_unsigned) {
2055 cmov_geu(op1, op2, dst, src);
2056 } else {
2057 cmov_ge(op1, op2, dst, src);
2058 }
2059 break;
2060 case BoolTest::lt:
2061 if (is_unsigned) {
2062 cmov_ltu(op1, op2, dst, src);
2063 } else {
2064 cmov_lt(op1, op2, dst, src);
2065 }
2066 break;
2067 case BoolTest::gt:
2068 if (is_unsigned) {
2069 cmov_gtu(op1, op2, dst, src);
2070 } else {
2071 cmov_gt(op1, op2, dst, src);
2072 }
2073 break;
2074 default:
2075 assert(false, "unsupported compare condition");
2076 ShouldNotReachHere();
2077 }
2078 }
2079
2080 void C2_MacroAssembler::enc_cmove_cmp_fp(int cmpFlag, FloatRegister op1, FloatRegister op2, Register dst, Register src, bool is_single) {
2081 int op_select = cmpFlag & (~unsigned_branch_mask);
2082
2083 switch (op_select) {
2084 case BoolTest::eq:
2085 cmov_cmp_fp_eq(op1, op2, dst, src, is_single);
2086 break;
2087 case BoolTest::ne:
2088 cmov_cmp_fp_ne(op1, op2, dst, src, is_single);
2089 break;
2090 case BoolTest::le:
2091 cmov_cmp_fp_le(op1, op2, dst, src, is_single);
2092 break;
2093 case BoolTest::ge:
2094 cmov_cmp_fp_ge(op1, op2, dst, src, is_single);
2095 break;
2096 case BoolTest::lt:
2097 cmov_cmp_fp_lt(op1, op2, dst, src, is_single);
2098 break;
2099 case BoolTest::gt:
2100 cmov_cmp_fp_gt(op1, op2, dst, src, is_single);
2101 break;
2102 default:
2103 assert(false, "unsupported compare condition");
2104 ShouldNotReachHere();
2105 }
2106 }
2107
2108 void C2_MacroAssembler::enc_cmove_fp_cmp(int cmpFlag, Register op1, Register op2,
2109 FloatRegister dst, FloatRegister src, bool is_single) {
2110 bool is_unsigned = (cmpFlag & unsigned_branch_mask) == unsigned_branch_mask;
2111 int op_select = cmpFlag & (~unsigned_branch_mask);
2112
2113 switch (op_select) {
2114 case BoolTest::eq:
2115 cmov_fp_eq(op1, op2, dst, src, is_single);
2116 break;
2117 case BoolTest::ne:
2118 cmov_fp_ne(op1, op2, dst, src, is_single);
2119 break;
2120 case BoolTest::le:
2121 if (is_unsigned) {
2122 cmov_fp_leu(op1, op2, dst, src, is_single);
2123 } else {
2124 cmov_fp_le(op1, op2, dst, src, is_single);
2125 }
2126 break;
2127 case BoolTest::ge:
2128 if (is_unsigned) {
2129 cmov_fp_geu(op1, op2, dst, src, is_single);
2130 } else {
2131 cmov_fp_ge(op1, op2, dst, src, is_single);
2132 }
2133 break;
2134 case BoolTest::lt:
2135 if (is_unsigned) {
2136 cmov_fp_ltu(op1, op2, dst, src, is_single);
2137 } else {
2138 cmov_fp_lt(op1, op2, dst, src, is_single);
2139 }
2140 break;
2141 case BoolTest::gt:
2142 if (is_unsigned) {
2143 cmov_fp_gtu(op1, op2, dst, src, is_single);
2144 } else {
2145 cmov_fp_gt(op1, op2, dst, src, is_single);
2146 }
2147 break;
2148 default:
2149 assert(false, "unsupported compare condition");
2150 ShouldNotReachHere();
2151 }
2152 }
2153
2154 void C2_MacroAssembler::enc_cmove_fp_cmp_fp(int cmpFlag,
2155 FloatRegister op1, FloatRegister op2,
2156 FloatRegister dst, FloatRegister src,
2157 bool cmp_single, bool cmov_single) {
2158 int op_select = cmpFlag & (~unsigned_branch_mask);
2159
2160 switch (op_select) {
2161 case BoolTest::eq:
2162 cmov_fp_cmp_fp_eq(op1, op2, dst, src, cmp_single, cmov_single);
2163 break;
2164 case BoolTest::ne:
2165 cmov_fp_cmp_fp_ne(op1, op2, dst, src, cmp_single, cmov_single);
2166 break;
2167 case BoolTest::le:
2168 cmov_fp_cmp_fp_le(op1, op2, dst, src, cmp_single, cmov_single);
2169 break;
2170 case BoolTest::ge:
2171 cmov_fp_cmp_fp_ge(op1, op2, dst, src, cmp_single, cmov_single);
2172 break;
2173 case BoolTest::lt:
2174 cmov_fp_cmp_fp_lt(op1, op2, dst, src, cmp_single, cmov_single);
2175 break;
2176 case BoolTest::gt:
2177 cmov_fp_cmp_fp_gt(op1, op2, dst, src, cmp_single, cmov_single);
2178 break;
2179 default:
2180 assert(false, "unsupported compare condition");
2181 ShouldNotReachHere();
2182 }
2183 }
2184
2185 // Set dst to NaN if any NaN input.
2186 void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
2187 FLOAT_TYPE ft, bool is_min) {
2188 assert_cond((ft != FLOAT_TYPE::half_precision) || UseZfh);
2189
2190 Label Done, Compare;
2191
2192 switch (ft) {
2193 case FLOAT_TYPE::half_precision:
2194 fclass_h(t0, src1);
2195 fclass_h(t1, src2);
2196
2197 orr(t0, t0, t1);
2198 andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2199 beqz(t0, Compare);
2200
2201 fadd_h(dst, src1, src2);
2202 j(Done);
2203
2204 bind(Compare);
2205 if (is_min) {
2206 fmin_h(dst, src1, src2);
2207 } else {
2208 fmax_h(dst, src1, src2);
2209 }
2210 break;
2211 case FLOAT_TYPE::single_precision:
2212 fclass_s(t0, src1);
2213 fclass_s(t1, src2);
2214
2215 orr(t0, t0, t1);
2216 andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2217 beqz(t0, Compare);
2218
2219 fadd_s(dst, src1, src2);
2220 j(Done);
2221
2222 bind(Compare);
2223 if (is_min) {
2224 fmin_s(dst, src1, src2);
2225 } else {
2226 fmax_s(dst, src1, src2);
2227 }
2228 break;
2229 case FLOAT_TYPE::double_precision:
2230 fclass_d(t0, src1);
2231 fclass_d(t1, src2);
2232
2233 orr(t0, t0, t1);
2234 andi(t0, t0, FClassBits::nan); // if src1 or src2 is quiet or signaling NaN then return NaN
2235 beqz(t0, Compare);
2236
2237 fadd_d(dst, src1, src2);
2238 j(Done);
2239
2240 bind(Compare);
2241 if (is_min) {
2242 fmin_d(dst, src1, src2);
2243 } else {
2244 fmax_d(dst, src1, src2);
2245 }
2246 break;
2247 default:
2248 ShouldNotReachHere();
2249 }
2250
2251 bind(Done);
2252 }
2253
2254 // According to Java SE specification, for floating-point round operations, if
2255 // the input is NaN, +/-infinity, or +/-0, the same input is returned as the
2256 // rounded result; this differs from behavior of RISC-V fcvt instructions (which
2257 // round out-of-range values to the nearest max or min value), therefore special
2258 // handling is needed by NaN, +/-Infinity, +/-0.
2259 void C2_MacroAssembler::round_double_mode(FloatRegister dst, FloatRegister src, int round_mode,
2260 Register tmp1, Register tmp2, Register tmp3) {
2261
2262 assert_different_registers(dst, src);
2263 assert_different_registers(tmp1, tmp2, tmp3);
2264
2265 // Set rounding mode for conversions
2266 // Here we use similar modes to double->long and long->double conversions
2267 // Different mode for long->double conversion matter only if long value was not representable as double,
2268 // we got long value as a result of double->long conversion so, it is definitely representable
2269 RoundingMode rm;
2270 switch (round_mode) {
2271 case RoundDoubleModeNode::rmode_ceil:
2272 rm = RoundingMode::rup;
2273 break;
2274 case RoundDoubleModeNode::rmode_floor:
2275 rm = RoundingMode::rdn;
2276 break;
2277 case RoundDoubleModeNode::rmode_rint:
2278 rm = RoundingMode::rne;
2279 break;
2280 default:
2281 ShouldNotReachHere();
2282 }
2283
2284 // tmp1 - is a register to store double converted to long int
2285 // tmp2 - is a register to create constant for comparison
2286 // tmp3 - is a register where we store modified result of double->long conversion
2287 Label done, bad_val;
2288
2289 // Conversion from double to long
2290 fcvt_l_d(tmp1, src, rm);
2291
2292 // Generate constant (tmp2)
2293 // tmp2 = 100...0000
2294 addi(tmp2, zr, 1);
2295 slli(tmp2, tmp2, 63);
2296
2297 // Prepare converted long (tmp1)
2298 // as a result when conversion overflow we got:
2299 // tmp1 = 011...1111 or 100...0000
2300 // Convert it to: tmp3 = 100...0000
2301 addi(tmp3, tmp1, 1);
2302 andi(tmp3, tmp3, -2);
2303 beq(tmp3, tmp2, bad_val);
2304
2305 // Conversion from long to double
2306 fcvt_d_l(dst, tmp1, rm);
2307 // Add sign of input value to result for +/- 0 cases
2308 fsgnj_d(dst, dst, src);
2309 j(done);
2310
2311 // If got conversion overflow return src
2312 bind(bad_val);
2313 fmv_d(dst, src);
2314
2315 bind(done);
2316 }
2317
2318 // According to Java SE specification, for floating-point signum operations, if
2319 // on input we have NaN or +/-0.0 value we should return it,
2320 // otherwise return +/- 1.0 using sign of input.
2321 // one - gives us a floating-point 1.0 (got from matching rule)
2322 // bool is_double - specifies single or double precision operations will be used.
2323 void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_double) {
2324 Label done;
2325
2326 is_double ? fclass_d(t0, dst)
2327 : fclass_s(t0, dst);
2328
2329 // check if input is -0, +0, signaling NaN or quiet NaN
2330 andi(t0, t0, FClassBits::zero | FClassBits::nan);
2331
2332 bnez(t0, done);
2333
2334 // use floating-point 1.0 with a sign of input
2335 is_double ? fsgnj_d(dst, one, dst)
2336 : fsgnj_s(dst, one, dst);
2337
2338 bind(done);
2339 }
2340
2341 static void float16_to_float_slow_path(C2_MacroAssembler& masm, C2GeneralStub<FloatRegister, Register, Register>& stub) {
2342 #define __ masm.
2343 FloatRegister dst = stub.data<0>();
2344 Register src = stub.data<1>();
2345 Register tmp = stub.data<2>();
2346 __ bind(stub.entry());
2347
2348 // following instructions mainly focus on NaN, as riscv does not handle
2349 // NaN well with fcvt, but the code also works for Inf at the same time.
2350
2351 // construct a NaN in 32 bits from the NaN in 16 bits,
2352 // we need the payloads of non-canonical NaNs to be preserved.
2353 __ mv(tmp, 0x7f800000);
2354 // sign-bit was already set via sign-extension if necessary.
2355 __ slli(t0, src, 13);
2356 __ orr(tmp, t0, tmp);
2357 __ fmv_w_x(dst, tmp);
2358
2359 __ j(stub.continuation());
2360 #undef __
2361 }
2362
2363 // j.l.Float.float16ToFloat
2364 void C2_MacroAssembler::float16_to_float(FloatRegister dst, Register src, Register tmp) {
2365 auto stub = C2CodeStub::make<FloatRegister, Register, Register>(dst, src, tmp, 20, float16_to_float_slow_path);
2366
2367 // On riscv, NaN needs a special process as fcvt does not work in that case.
2368 // On riscv, Inf does not need a special process as fcvt can handle it correctly.
2369 // but we consider to get the slow path to process NaN and Inf at the same time,
2370 // as both of them are rare cases, and if we try to get the slow path to handle
2371 // only NaN case it would sacrifise the performance for normal cases,
2372 // i.e. non-NaN and non-Inf cases.
2373
2374 // check whether it's a NaN or +/- Inf.
2375 mv(t0, 0x7c00);
2376 andr(tmp, src, t0);
2377 // jump to stub processing NaN and Inf cases.
2378 beq(t0, tmp, stub->entry(), /* is_far */ true);
2379
2380 // non-NaN or non-Inf cases, just use built-in instructions.
2381 fmv_h_x(dst, src);
2382 fcvt_s_h(dst, dst);
2383
2384 bind(stub->continuation());
2385 }
2386
2387 static void float_to_float16_slow_path(C2_MacroAssembler& masm, C2GeneralStub<Register, FloatRegister, Register>& stub) {
2388 #define __ masm.
2389 Register dst = stub.data<0>();
2390 FloatRegister src = stub.data<1>();
2391 Register tmp = stub.data<2>();
2392 __ bind(stub.entry());
2393
2394 __ float_to_float16_NaN(dst, src, t0, tmp);
2395
2396 __ j(stub.continuation());
2397 #undef __
2398 }
2399
2400 // j.l.Float.floatToFloat16
2401 void C2_MacroAssembler::float_to_float16(Register dst, FloatRegister src, FloatRegister ftmp, Register xtmp) {
2402 auto stub = C2CodeStub::make<Register, FloatRegister, Register>(dst, src, xtmp, 64, float_to_float16_slow_path);
2403
2404 // On riscv, NaN needs a special process as fcvt does not work in that case.
2405
2406 // check whether it's a NaN.
2407 // replace fclass with feq as performance optimization.
2408 feq_s(t0, src, src);
2409 // jump to stub processing NaN cases.
2410 beqz(t0, stub->entry(), /* is_far */ true);
2411
2412 // non-NaN cases, just use built-in instructions.
2413 fcvt_h_s(ftmp, src);
2414 fmv_x_h(dst, ftmp);
2415
2416 bind(stub->continuation());
2417 }
2418
2419 static void float16_to_float_v_slow_path(C2_MacroAssembler& masm, C2GeneralStub<VectorRegister, VectorRegister, uint>& stub) {
2420 #define __ masm.
2421 VectorRegister dst = stub.data<0>();
2422 VectorRegister src = stub.data<1>();
2423 uint vector_length = stub.data<2>();
2424 __ bind(stub.entry());
2425
2426 // following instructions mainly focus on NaN, as riscv does not handle
2427 // NaN well with vfwcvt_f_f_v, but the code also works for Inf at the same time.
2428 //
2429 // construct NaN's in 32 bits from the NaN's in 16 bits,
2430 // we need the payloads of non-canonical NaNs to be preserved.
2431
2432 // adjust vector type to 2 * SEW.
2433 __ vsetvli_helper(T_FLOAT, vector_length, Assembler::m1);
2434 // widen and sign-extend src data.
2435 __ vsext_vf2(dst, src, Assembler::v0_t);
2436 __ mv(t0, 0x7f800000);
2437 // sign-bit was already set via sign-extension if necessary.
2438 __ vsll_vi(dst, dst, 13, Assembler::v0_t);
2439 __ vor_vx(dst, dst, t0, Assembler::v0_t);
2440
2441 __ j(stub.continuation());
2442 #undef __
2443 }
2444
2445 // j.l.Float.float16ToFloat
2446 void C2_MacroAssembler::float16_to_float_v(VectorRegister dst, VectorRegister src, uint vector_length) {
2447 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, uint>
2448 (dst, src, vector_length, 24, float16_to_float_v_slow_path);
2449 assert_different_registers(dst, src);
2450
2451 // On riscv, NaN needs a special process as vfwcvt_f_f_v does not work in that case.
2452 // On riscv, Inf does not need a special process as vfwcvt_f_f_v can handle it correctly.
2453 // but we consider to get the slow path to process NaN and Inf at the same time,
2454 // as both of them are rare cases, and if we try to get the slow path to handle
2455 // only NaN case it would sacrifise the performance for normal cases,
2456 // i.e. non-NaN and non-Inf cases.
2457
2458 vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2);
2459
2460 // check whether there is a NaN or +/- Inf.
2461 mv(t0, 0x7c00);
2462 vand_vx(v0, src, t0);
2463 // v0 will be used as mask in slow path.
2464 vmseq_vx(v0, v0, t0);
2465 vcpop_m(t0, v0);
2466
2467 // For non-NaN or non-Inf cases, just use built-in instructions.
2468 vfwcvt_f_f_v(dst, src);
2469
2470 // jump to stub processing NaN and Inf cases if there is any of them in the vector-wide.
2471 bnez(t0, stub->entry(), /* is_far */ true);
2472
2473 bind(stub->continuation());
2474 }
2475
2476 static void float_to_float16_v_slow_path(C2_MacroAssembler& masm,
2477 C2GeneralStub<VectorRegister, VectorRegister, VectorRegister>& stub) {
2478 #define __ masm.
2479 VectorRegister dst = stub.data<0>();
2480 VectorRegister src = stub.data<1>();
2481 VectorRegister vtmp = stub.data<2>();
2482 assert_different_registers(dst, src, vtmp);
2483
2484 __ bind(stub.entry());
2485
2486 // Active elements (NaNs) are marked in v0 mask register.
2487 // mul is already set to mf2 in float_to_float16_v.
2488
2489 // Float (32 bits)
2490 // Bit: 31 30 to 23 22 to 0
2491 // +---+------------------+-----------------------------+
2492 // | S | Exponent | Mantissa (Fraction) |
2493 // +---+------------------+-----------------------------+
2494 // 1 bit 8 bits 23 bits
2495 //
2496 // Float (16 bits)
2497 // Bit: 15 14 to 10 9 to 0
2498 // +---+----------------+------------------+
2499 // | S | Exponent | Mantissa |
2500 // +---+----------------+------------------+
2501 // 1 bit 5 bits 10 bits
2502 const int fp_sign_bits = 1;
2503 const int fp32_bits = 32;
2504 const int fp32_mantissa_2nd_part_bits = 9;
2505 const int fp32_mantissa_3rd_part_bits = 4;
2506 const int fp16_exponent_bits = 5;
2507 const int fp16_mantissa_bits = 10;
2508
2509 // preserve the sign bit and exponent, clear mantissa.
2510 __ vnsra_wi(dst, src, fp32_bits - fp_sign_bits - fp16_exponent_bits, Assembler::v0_t);
2511 __ vsll_vi(dst, dst, fp16_mantissa_bits, Assembler::v0_t);
2512
2513 // Preserve high order bit of float NaN in the
2514 // binary16 result NaN (tenth bit); OR in remaining
2515 // bits into lower 9 bits of binary 16 significand.
2516 // | (doppel & 0x007f_e000) >> 13 // 10 bits
2517 // | (doppel & 0x0000_1ff0) >> 4 // 9 bits
2518 // | (doppel & 0x0000_000f)); // 4 bits
2519 //
2520 // Check j.l.Float.floatToFloat16 for more information.
2521 // 10 bits
2522 __ vnsrl_wi(vtmp, src, fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits, Assembler::v0_t);
2523 __ mv(t0, 0x3ff); // retain first part of mantissa in a float 32
2524 __ vand_vx(vtmp, vtmp, t0, Assembler::v0_t);
2525 __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
2526 // 9 bits
2527 __ vnsrl_wi(vtmp, src, fp32_mantissa_3rd_part_bits, Assembler::v0_t);
2528 __ mv(t0, 0x1ff); // retain second part of mantissa in a float 32
2529 __ vand_vx(vtmp, vtmp, t0, Assembler::v0_t);
2530 __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
2531 // 4 bits
2532 // Narrow shift is necessary to move data from 32 bits element to 16 bits element in vector register.
2533 __ vnsrl_wi(vtmp, src, 0, Assembler::v0_t);
2534 __ vand_vi(vtmp, vtmp, 0xf, Assembler::v0_t);
2535 __ vor_vv(dst, dst, vtmp, Assembler::v0_t);
2536
2537 __ j(stub.continuation());
2538 #undef __
2539 }
2540
2541 // j.l.Float.float16ToFloat
2542 void C2_MacroAssembler::float_to_float16_v(VectorRegister dst, VectorRegister src,
2543 VectorRegister vtmp, Register tmp, uint vector_length) {
2544 assert_different_registers(dst, src, vtmp);
2545
2546 auto stub = C2CodeStub::make<VectorRegister, VectorRegister, VectorRegister>
2547 (dst, src, vtmp, 56, float_to_float16_v_slow_path);
2548
2549 // On riscv, NaN needs a special process as vfncvt_f_f_w does not work in that case.
2550
2551 vsetvli_helper(BasicType::T_FLOAT, vector_length, Assembler::m1);
2552
2553 // check whether there is a NaN.
2554 // replace v_fclass with vmfne_vv as performance optimization.
2555 vmfne_vv(v0, src, src);
2556 vcpop_m(t0, v0);
2557
2558 vsetvli_helper(BasicType::T_SHORT, vector_length, Assembler::mf2, tmp);
2559
2560 // For non-NaN cases, just use built-in instructions.
2561 vfncvt_f_f_w(dst, src);
2562
2563 // jump to stub processing NaN cases.
2564 bnez(t0, stub->entry(), /* is_far */ true);
2565
2566 bind(stub->continuation());
2567 }
2568
2569 void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, BasicType bt, int vlen) {
2570 vsetvli_helper(bt, vlen);
2571
2572 // check if input is -0, +0, signaling NaN or quiet NaN
2573 vfclass_v(v0, dst);
2574 mv(t0, FClassBits::zero | FClassBits::nan);
2575 vand_vx(v0, v0, t0);
2576 vmseq_vi(v0, v0, 0);
2577
2578 // use floating-point 1.0 with a sign of input
2579 vfsgnj_vv(dst, one, dst, v0_t);
2580 }
2581
2582 // j.l.Math.round(float)
2583 // Returns the closest int to the argument, with ties rounding to positive infinity.
2584 // We need to handle 3 special cases defined by java api spec:
2585 // NaN,
2586 // float >= Integer.MAX_VALUE,
2587 // float <= Integer.MIN_VALUE.
2588 void C2_MacroAssembler::java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2589 BasicType bt, uint vector_length) {
2590 // In riscv, there is no straight corresponding rounding mode to satisfy the behaviour defined,
2591 // in java api spec, i.e. any rounding mode can not handle some corner cases, e.g.
2592 // RNE is the closest one, but it ties to "even", which means 1.5/2.5 both will be converted
2593 // to 2, instead of 2 and 3 respectively.
2594 // RUP does not work either, although java api requires "rounding to positive infinity",
2595 // but both 1.3/1.8 will be converted to 2, instead of 1 and 2 respectively.
2596 //
2597 // The optimal solution for non-NaN cases is:
2598 // src+0.5 => dst, with rdn rounding mode,
2599 // convert dst from float to int, with rnd rounding mode.
2600 // and, this solution works as expected for float >= Integer.MAX_VALUE and float <= Integer.MIN_VALUE.
2601 //
2602 // But, we still need to handle NaN explicilty with vector mask instructions.
2603 //
2604 // Check MacroAssembler::java_round_float and C2_MacroAssembler::vector_round_sve in aarch64 for more details.
2605
2606 csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2607 vsetvli_helper(bt, vector_length);
2608
2609 // don't rearrage the instructions sequence order without performance testing.
2610 // check MacroAssembler::java_round_float in riscv64 for more details.
2611 mv(t0, jint_cast(0.5f));
2612 fmv_w_x(ftmp, t0);
2613
2614 // replacing vfclass with feq as performance optimization
2615 vmfeq_vv(v0, src, src);
2616 // set dst = 0 in cases of NaN
2617 vmv_v_x(dst, zr);
2618
2619 // dst = (src + 0.5) rounded down towards negative infinity
2620 vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2621 vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2622
2623 csrwi(CSR_FRM, C2_MacroAssembler::rne);
2624 }
2625
2626 // java.lang.Math.round(double a)
2627 // Returns the closest long to the argument, with ties rounding to positive infinity.
2628 void C2_MacroAssembler::java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp,
2629 BasicType bt, uint vector_length) {
2630 // check C2_MacroAssembler::java_round_float_v above for more details.
2631
2632 csrwi(CSR_FRM, C2_MacroAssembler::rdn);
2633 vsetvli_helper(bt, vector_length);
2634
2635 mv(t0, julong_cast(0.5));
2636 fmv_d_x(ftmp, t0);
2637
2638 // replacing vfclass with feq as performance optimization
2639 vmfeq_vv(v0, src, src);
2640 // set dst = 0 in cases of NaN
2641 vmv_v_x(dst, zr);
2642
2643 // dst = (src + 0.5) rounded down towards negative infinity
2644 vfadd_vf(dst, src, ftmp, Assembler::v0_t);
2645 vfcvt_x_f_v(dst, dst, Assembler::v0_t); // in RoundingMode::rdn
2646
2647 csrwi(CSR_FRM, C2_MacroAssembler::rne);
2648 }
2649
2650 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
2651 VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE,
2652 Assembler::LMUL lmul) {
2653 Label loop;
2654 Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
2655
2656 bind(loop);
2657 vsetvli(tmp1, cnt, sew, lmul);
2658 vlex_v(vr1, a1, sew);
2659 vlex_v(vr2, a2, sew);
2660 vmsne_vv(vrs, vr1, vr2);
2661 vfirst_m(tmp2, vrs);
2662 bgez(tmp2, DONE);
2663 sub(cnt, cnt, tmp1);
2664 if (!islatin) {
2665 slli(tmp1, tmp1, 1); // get byte counts
2666 }
2667 add(a1, a1, tmp1);
2668 add(a2, a2, tmp1);
2669 bnez(cnt, loop);
2670
2671 mv(result, true);
2672 }
2673
2674 void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt) {
2675 Label DONE;
2676 Register tmp1 = t0;
2677 Register tmp2 = t1;
2678
2679 BLOCK_COMMENT("string_equals_v {");
2680
2681 mv(result, false);
2682
2683 element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, true, DONE, Assembler::m2);
2684
2685 bind(DONE);
2686 BLOCK_COMMENT("} string_equals_v");
2687 }
2688
2689 // used by C2 ClearArray patterns.
2690 // base: Address of a buffer to be zeroed
2691 // cnt: Count in HeapWords
2692 //
2693 // base, cnt, v4, v5, v6, v7 and t0 are clobbered.
2694 void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
2695 Label loop;
2696
2697 // making zero words
2698 vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2699 vxor_vv(v4, v4, v4);
2700
2701 bind(loop);
2702 vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
2703 vse64_v(v4, base);
2704 sub(cnt, cnt, t0);
2705 shadd(base, t0, base, t0, 3);
2706 bnez(cnt, loop);
2707 }
2708
2709 void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
2710 Register cnt1, int elem_size) {
2711 assert(elem_size == 1 || elem_size == 2, "must be char or byte");
2712 assert_different_registers(a1, a2, result, cnt1, t0, t1);
2713
2714 Label DONE;
2715 Register tmp1 = t0;
2716 Register tmp2 = t1;
2717 Register cnt2 = tmp2;
2718 int length_offset = arrayOopDesc::length_offset_in_bytes();
2719 int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
2720
2721 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
2722
2723 BLOCK_COMMENT("arrays_equals_v {");
2724
2725 // if (a1 == a2), return true
2726 mv(result, true);
2727 beq(a1, a2, DONE);
2728
2729 mv(result, false);
2730 // if a1 == null or a2 == null, return false
2731 beqz(a1, DONE);
2732 beqz(a2, DONE);
2733 // if (a1.length != a2.length), return false
2734 lwu(cnt1, Address(a1, length_offset));
2735 lwu(cnt2, Address(a2, length_offset));
2736 bne(cnt1, cnt2, DONE);
2737
2738 la(a1, Address(a1, base_offset));
2739 la(a2, Address(a2, base_offset));
2740
2741 element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE, Assembler::m2);
2742
2743 bind(DONE);
2744
2745 BLOCK_COMMENT("} arrays_equals_v");
2746 }
2747
2748 void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
2749 Register result, Register tmp1, Register tmp2, int encForm) {
2750 Label DIFFERENCE, DONE, L, loop;
2751 bool encLL = encForm == StrIntrinsicNode::LL;
2752 bool encLU = encForm == StrIntrinsicNode::LU;
2753 bool encUL = encForm == StrIntrinsicNode::UL;
2754
2755 bool str1_isL = encLL || encLU;
2756 bool str2_isL = encLL || encUL;
2757
2758 int minCharsInWord = encLL ? wordSize : wordSize / 2;
2759
2760 BLOCK_COMMENT("string_compare_v {");
2761
2762 // for Latin strings, 1 byte for 1 character
2763 // for UTF16 strings, 2 bytes for 1 character
2764 if (!str1_isL)
2765 sraiw(cnt1, cnt1, 1);
2766 if (!str2_isL)
2767 sraiw(cnt2, cnt2, 1);
2768
2769 // if str1 == str2, return the difference
2770 // save the minimum of the string lengths in cnt2.
2771 sub(result, cnt1, cnt2);
2772 bgt(cnt1, cnt2, L);
2773 mv(cnt2, cnt1);
2774 bind(L);
2775
2776 // We focus on the optimization of small sized string.
2777 // Please check below document for string size distribution statistics.
2778 // https://cr.openjdk.org/~shade/density/string-density-report.pdf
2779 if (str1_isL == str2_isL) { // LL or UU
2780 // Below construction of v regs and lmul is based on test on 2 different boards,
2781 // vlen == 128 and vlen == 256 respectively.
2782 if (!encLL && MaxVectorSize == 16) { // UU
2783 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v4, v8, v4, encLL, DIFFERENCE, Assembler::m4);
2784 } else { // UU + MaxVectorSize or LL
2785 element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE, Assembler::m2);
2786 }
2787
2788 j(DONE);
2789 } else { // LU or UL
2790 Register strL = encLU ? str1 : str2;
2791 Register strU = encLU ? str2 : str1;
2792 VectorRegister vstr1 = encLU ? v8 : v4;
2793 VectorRegister vstr2 = encLU ? v4 : v8;
2794
2795 bind(loop);
2796 vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
2797 vle8_v(vstr1, strL);
2798 vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
2799 vzext_vf2(vstr2, vstr1);
2800 vle16_v(vstr1, strU);
2801 vmsne_vv(v4, vstr2, vstr1);
2802 vfirst_m(tmp2, v4);
2803 bgez(tmp2, DIFFERENCE);
2804 sub(cnt2, cnt2, tmp1);
2805 add(strL, strL, tmp1);
2806 shadd(strU, tmp1, strU, tmp1, 1);
2807 bnez(cnt2, loop);
2808 j(DONE);
2809 }
2810
2811 bind(DIFFERENCE);
2812 slli(tmp1, tmp2, 1);
2813 add(str1, str1, str1_isL ? tmp2 : tmp1);
2814 add(str2, str2, str2_isL ? tmp2 : tmp1);
2815 str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
2816 str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
2817 sub(result, tmp1, tmp2);
2818
2819 bind(DONE);
2820
2821 BLOCK_COMMENT("} string_compare_v");
2822 }
2823
2824 void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
2825 Label loop;
2826 assert_different_registers(src, dst, len, tmp, t0);
2827
2828 BLOCK_COMMENT("byte_array_inflate_v {");
2829 bind(loop);
2830 vsetvli(tmp, len, Assembler::e8, Assembler::m2);
2831 vle8_v(v6, src);
2832 vsetvli(t0, len, Assembler::e16, Assembler::m4);
2833 vzext_vf2(v4, v6);
2834 vse16_v(v4, dst);
2835 sub(len, len, tmp);
2836 add(src, src, tmp);
2837 shadd(dst, tmp, dst, tmp, 1);
2838 bnez(len, loop);
2839 BLOCK_COMMENT("} byte_array_inflate_v");
2840 }
2841
2842 // Compress char[] array to byte[].
2843 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
2844 // result: the array length if every element in array can be encoded,
2845 // otherwise, the index of first non-latin1 (> 0xff) character.
2846 void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
2847 Register result, Register tmp) {
2848 encode_iso_array_v(src, dst, len, result, tmp, false);
2849 }
2850
2851 // Intrinsic for
2852 //
2853 // - sun.nio.cs.ISO_8859_1.Encoder#encodeISOArray0(byte[] sa, int sp, byte[] da, int dp, int len)
2854 // Encodes char[] to byte[] in ISO-8859-1
2855 //
2856 // - java.lang.StringCoding#encodeISOArray0(byte[] sa, int sp, byte[] da, int dp, int len)
2857 // Encodes byte[] (containing UTF-16) to byte[] in ISO-8859-1
2858 //
2859 // - java.lang.StringCoding#encodeAsciiArray0(char[] sa, int sp, byte[] da, int dp, int len)
2860 // Encodes char[] to byte[] in ASCII
2861 //
2862 // This version always returns the number of characters copied. A successful
2863 // copy will complete with the post-condition: 'res' == 'len', while an
2864 // unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
2865 //
2866 // Clobbers: src, dst, len, result, t0
2867 void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
2868 Register result, Register tmp, bool ascii) {
2869 Label loop, fail, done;
2870
2871 BLOCK_COMMENT("encode_iso_array_v {");
2872 mv(result, 0);
2873
2874 bind(loop);
2875 mv(tmp, ascii ? 0x7f : 0xff);
2876 vsetvli(t0, len, Assembler::e16, Assembler::m2);
2877 vle16_v(v2, src);
2878
2879 vmsgtu_vx(v1, v2, tmp);
2880 vfirst_m(tmp, v1);
2881 vmsbf_m(v0, v1);
2882 // compress char to byte
2883 vsetvli(t0, len, Assembler::e8);
2884 vncvt_x_x_w(v1, v2, Assembler::v0_t);
2885 vse8_v(v1, dst, Assembler::v0_t);
2886
2887 // fail if char > 0x7f/0xff
2888 bgez(tmp, fail);
2889 add(result, result, t0);
2890 add(dst, dst, t0);
2891 sub(len, len, t0);
2892 shadd(src, t0, src, t0, 1);
2893 bnez(len, loop);
2894 j(done);
2895
2896 bind(fail);
2897 add(result, result, tmp);
2898
2899 bind(done);
2900 BLOCK_COMMENT("} encode_iso_array_v");
2901 }
2902
2903 void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
2904 Label LOOP, SET_RESULT, DONE;
2905
2906 BLOCK_COMMENT("count_positives_v {");
2907 assert_different_registers(ary, len, result, tmp);
2908
2909 mv(result, zr);
2910
2911 bind(LOOP);
2912 vsetvli(t0, len, Assembler::e8, Assembler::m4);
2913 vle8_v(v4, ary);
2914 vmslt_vx(v4, v4, zr);
2915 vfirst_m(tmp, v4);
2916 bgez(tmp, SET_RESULT);
2917 // if tmp == -1, all bytes are positive
2918 add(result, result, t0);
2919
2920 sub(len, len, t0);
2921 add(ary, ary, t0);
2922 bnez(len, LOOP);
2923 j(DONE);
2924
2925 // add remaining positive bytes count
2926 bind(SET_RESULT);
2927 add(result, result, tmp);
2928
2929 bind(DONE);
2930 BLOCK_COMMENT("} count_positives_v");
2931 }
2932
2933 void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
2934 Register ch, Register result,
2935 Register tmp1, Register tmp2,
2936 bool isL) {
2937 mv(result, zr);
2938
2939 Label loop, MATCH, DONE;
2940 Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
2941 bind(loop);
2942 vsetvli(tmp1, cnt1, sew, Assembler::m4);
2943 vlex_v(v4, str1, sew);
2944 vmseq_vx(v4, v4, ch);
2945 vfirst_m(tmp2, v4);
2946 bgez(tmp2, MATCH); // if equal, return index
2947
2948 add(result, result, tmp1);
2949 sub(cnt1, cnt1, tmp1);
2950 if (!isL) slli(tmp1, tmp1, 1);
2951 add(str1, str1, tmp1);
2952 bnez(cnt1, loop);
2953
2954 mv(result, -1);
2955 j(DONE);
2956
2957 bind(MATCH);
2958 add(result, result, tmp2);
2959
2960 bind(DONE);
2961 }
2962
2963 // Set dst to NaN if any NaN input.
2964 void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2965 BasicType bt, bool is_min, uint vector_length) {
2966 assert_different_registers(dst, src1, src2);
2967
2968 vsetvli_helper(bt, vector_length);
2969
2970 is_min ? vfmin_vv(dst, src1, src2)
2971 : vfmax_vv(dst, src1, src2);
2972
2973 vmfne_vv(v0, src1, src1);
2974 vfadd_vv(dst, src1, src1, Assembler::v0_t);
2975 vmfne_vv(v0, src2, src2);
2976 vfadd_vv(dst, src2, src2, Assembler::v0_t);
2977 }
2978
2979 // Set dst to NaN if any NaN input.
2980 // The destination vector register elements corresponding to masked-off elements
2981 // are handled with a mask-undisturbed policy.
2982 void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
2983 VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
2984 BasicType bt, bool is_min, uint vector_length) {
2985 assert_different_registers(src1, src2, tmp1, tmp2);
2986 vsetvli_helper(bt, vector_length);
2987
2988 // Check vector elements of src1 and src2 for NaN.
2989 vmfeq_vv(tmp1, src1, src1);
2990 vmfeq_vv(tmp2, src2, src2);
2991
2992 vmandn_mm(v0, vmask, tmp1);
2993 vfadd_vv(dst, src1, src1, Assembler::v0_t);
2994 vmandn_mm(v0, vmask, tmp2);
2995 vfadd_vv(dst, src2, src2, Assembler::v0_t);
2996
2997 vmand_mm(tmp2, tmp1, tmp2);
2998 vmand_mm(v0, vmask, tmp2);
2999 is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
3000 : vfmax_vv(dst, src1, src2, Assembler::v0_t);
3001 }
3002
3003 // Set dst to NaN if any NaN input.
3004 void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
3005 FloatRegister src1, VectorRegister src2,
3006 VectorRegister tmp1, VectorRegister tmp2,
3007 bool is_double, bool is_min, uint vector_length, VectorMask vm) {
3008 assert_different_registers(dst, src1);
3009 assert_different_registers(src2, tmp1, tmp2);
3010
3011 Label L_done, L_NaN_1, L_NaN_2;
3012 // Set dst to src1 if src1 is NaN
3013 is_double ? feq_d(t0, src1, src1)
3014 : feq_s(t0, src1, src1);
3015 beqz(t0, L_NaN_2);
3016
3017 vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
3018 vfmv_s_f(tmp2, src1);
3019
3020 is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
3021 : vfredmax_vs(tmp1, src2, tmp2, vm);
3022 vfmv_f_s(dst, tmp1);
3023
3024 // Checking NaNs in src2
3025 vmfne_vv(tmp1, src2, src2, vm);
3026 vcpop_m(t0, tmp1, vm);
3027 beqz(t0, L_done);
3028
3029 bind(L_NaN_1);
3030 vfredusum_vs(tmp1, src2, tmp2, vm);
3031 vfmv_f_s(dst, tmp1);
3032 j(L_done);
3033
3034 bind(L_NaN_2);
3035 is_double ? fmv_d(dst, src1)
3036 : fmv_s(dst, src1);
3037 bind(L_done);
3038 }
3039
3040 bool C2_MacroAssembler::in_scratch_emit_size() {
3041 if (ciEnv::current()->task() != nullptr) {
3042 PhaseOutput* phase_output = Compile::current()->output();
3043 if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
3044 return true;
3045 }
3046 }
3047 return MacroAssembler::in_scratch_emit_size();
3048 }
3049
3050 void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
3051 VectorRegister src2, VectorRegister tmp,
3052 int opc, BasicType bt, uint vector_length, VectorMask vm) {
3053 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
3054 vsetvli_helper(bt, vector_length);
3055 vmv_s_x(tmp, src1);
3056 switch (opc) {
3057 case Op_AddReductionVI:
3058 case Op_AddReductionVL:
3059 vredsum_vs(tmp, src2, tmp, vm);
3060 break;
3061 case Op_AndReductionV:
3062 vredand_vs(tmp, src2, tmp, vm);
3063 break;
3064 case Op_OrReductionV:
3065 vredor_vs(tmp, src2, tmp, vm);
3066 break;
3067 case Op_XorReductionV:
3068 vredxor_vs(tmp, src2, tmp, vm);
3069 break;
3070 case Op_MaxReductionV:
3071 vredmax_vs(tmp, src2, tmp, vm);
3072 break;
3073 case Op_MinReductionV:
3074 vredmin_vs(tmp, src2, tmp, vm);
3075 break;
3076 default:
3077 ShouldNotReachHere();
3078 }
3079 vmv_x_s(dst, tmp);
3080 }
3081
3082 void C2_MacroAssembler::reduce_mul_integral_v(Register dst, Register src1, VectorRegister src2,
3083 VectorRegister vtmp1, VectorRegister vtmp2,
3084 BasicType bt, uint vector_length, VectorMask vm) {
3085 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
3086 vsetvli_helper(bt, vector_length);
3087
3088 vector_length /= 2;
3089 if (vm != Assembler::unmasked) {
3090 // This behaviour is consistent with spec requirements of vector API, for `reduceLanes`:
3091 // If no elements are selected, an operation-specific identity value is returned.
3092 // If the operation is MUL, then the identity value is one.
3093 vmv_v_i(vtmp1, 1);
3094 vmerge_vvm(vtmp2, vtmp1, src2); // vm == v0
3095 slidedown_v(vtmp1, vtmp2, vector_length);
3096
3097 vsetvli_helper(bt, vector_length);
3098 vmul_vv(vtmp1, vtmp1, vtmp2);
3099 } else {
3100 slidedown_v(vtmp1, src2, vector_length);
3101
3102 vsetvli_helper(bt, vector_length);
3103 vmul_vv(vtmp1, vtmp1, src2);
3104 }
3105
3106 while (vector_length > 1) {
3107 vector_length /= 2;
3108 slidedown_v(vtmp2, vtmp1, vector_length);
3109 vsetvli_helper(bt, vector_length);
3110 vmul_vv(vtmp1, vtmp1, vtmp2);
3111 }
3112
3113 vmv_x_s(dst, vtmp1);
3114 if (bt == T_INT) {
3115 mulw(dst, dst, src1);
3116 } else {
3117 mul(dst, dst, src1);
3118 }
3119 }
3120
3121 // Set vl and vtype for full and partial vector operations.
3122 // (vma = mu, vta = tu, vill = false)
3123 void C2_MacroAssembler::vsetvli_helper(BasicType bt, uint vector_length, LMUL vlmul, Register tmp) {
3124 Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
3125 if (vector_length <= 31) {
3126 vsetivli(tmp, vector_length, sew, vlmul);
3127 } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
3128 vsetvli(tmp, x0, sew, vlmul);
3129 } else {
3130 mv(tmp, vector_length);
3131 vsetvli(tmp, tmp, sew, vlmul);
3132 }
3133 }
3134
3135 void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
3136 int cond, BasicType bt, uint vector_length, VectorMask vm) {
3137 assert(is_integral_type(bt), "unsupported element type");
3138 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
3139 vsetvli_helper(bt, vector_length);
3140 if (vm == Assembler::v0_t) {
3141 vmclr_m(vd);
3142 }
3143 switch (cond) {
3144 case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
3145 case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
3146 case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
3147 case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
3148 case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
3149 case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
3150 case BoolTest::ule: vmsleu_vv(vd, src1, src2, vm); break;
3151 case BoolTest::uge: vmsgeu_vv(vd, src1, src2, vm); break;
3152 case BoolTest::ult: vmsltu_vv(vd, src1, src2, vm); break;
3153 case BoolTest::ugt: vmsgtu_vv(vd, src1, src2, vm); break;
3154 default:
3155 assert(false, "unsupported compare condition");
3156 ShouldNotReachHere();
3157 }
3158 }
3159
3160 void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
3161 int cond, BasicType bt, uint vector_length, VectorMask vm) {
3162 assert(is_floating_point_type(bt), "unsupported element type");
3163 assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
3164 vsetvli_helper(bt, vector_length);
3165 if (vm == Assembler::v0_t) {
3166 vmclr_m(vd);
3167 }
3168 switch (cond) {
3169 case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
3170 case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
3171 case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
3172 case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
3173 case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
3174 case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
3175 default:
3176 assert(false, "unsupported compare condition");
3177 ShouldNotReachHere();
3178 }
3179 }
3180
3181 // In Matcher::scalable_predicate_reg_slots,
3182 // we assume each predicate register is one-eighth of the size of
3183 // scalable vector register, one mask bit per vector byte.
3184 void C2_MacroAssembler::spill_vmask(VectorRegister v, int offset) {
3185 vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
3186 add(t0, sp, offset);
3187 vse8_v(v, t0);
3188 }
3189
3190 void C2_MacroAssembler::unspill_vmask(VectorRegister v, int offset) {
3191 vsetvli_helper(T_BYTE, MaxVectorSize >> 3);
3192 add(t0, sp, offset);
3193 vle8_v(v, t0);
3194 }
3195
3196 void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
3197 VectorRegister src, BasicType src_bt, bool is_signed) {
3198 assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
3199 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
3200 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
3201 // The destination EEW is greater than the source EEW, the source EMUL is at least 1,
3202 // and the overlap is in the highest-numbered part of the destination register group.
3203 // Since LMUL=1, vd and vs cannot be the same.
3204 assert_different_registers(dst, src);
3205
3206 vsetvli_helper(dst_bt, vector_length);
3207 if (is_signed) {
3208 if (src_bt == T_BYTE) {
3209 switch (dst_bt) {
3210 case T_SHORT:
3211 vsext_vf2(dst, src);
3212 break;
3213 case T_INT:
3214 vsext_vf4(dst, src);
3215 break;
3216 case T_LONG:
3217 vsext_vf8(dst, src);
3218 break;
3219 default:
3220 ShouldNotReachHere();
3221 }
3222 } else if (src_bt == T_SHORT) {
3223 if (dst_bt == T_INT) {
3224 vsext_vf2(dst, src);
3225 } else {
3226 vsext_vf4(dst, src);
3227 }
3228 } else if (src_bt == T_INT) {
3229 vsext_vf2(dst, src);
3230 }
3231 } else {
3232 if (src_bt == T_BYTE) {
3233 switch (dst_bt) {
3234 case T_SHORT:
3235 vzext_vf2(dst, src);
3236 break;
3237 case T_INT:
3238 vzext_vf4(dst, src);
3239 break;
3240 case T_LONG:
3241 vzext_vf8(dst, src);
3242 break;
3243 default:
3244 ShouldNotReachHere();
3245 }
3246 } else if (src_bt == T_SHORT) {
3247 if (dst_bt == T_INT) {
3248 vzext_vf2(dst, src);
3249 } else {
3250 vzext_vf4(dst, src);
3251 }
3252 } else if (src_bt == T_INT) {
3253 vzext_vf2(dst, src);
3254 }
3255 }
3256 }
3257
3258 // Vector narrow from src to dst with specified element sizes.
3259 // High part of dst vector will be filled with zero.
3260 void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, uint vector_length,
3261 VectorRegister src, BasicType src_bt) {
3262 assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) && type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
3263 assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
3264 mv(t0, vector_length);
3265 if (src_bt == T_LONG) {
3266 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
3267 // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
3268 // So we can currently only scale down by 1/2 the width at a time.
3269 vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
3270 vncvt_x_x_w(dst, src);
3271 if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
3272 vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
3273 vncvt_x_x_w(dst, dst);
3274 if (dst_bt == T_BYTE) {
3275 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3276 vncvt_x_x_w(dst, dst);
3277 }
3278 }
3279 } else if (src_bt == T_INT) {
3280 // T_SHORT
3281 vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
3282 vncvt_x_x_w(dst, src);
3283 if (dst_bt == T_BYTE) {
3284 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3285 vncvt_x_x_w(dst, dst);
3286 }
3287 } else if (src_bt == T_SHORT) {
3288 vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
3289 vncvt_x_x_w(dst, src);
3290 }
3291 }
3292
3293 #define VFCVT_SAFE(VFLOATCVT) \
3294 void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
3295 assert_different_registers(dst, src); \
3296 vxor_vv(dst, dst, dst); \
3297 vmfeq_vv(v0, src, src); \
3298 VFLOATCVT(dst, src, Assembler::v0_t); \
3299 }
3300
3301 VFCVT_SAFE(vfcvt_rtz_x_f_v);
3302
3303 #undef VFCVT_SAFE
3304
3305 // Extract a scalar element from an vector at position 'idx'.
3306 // The input elements in src are expected to be of integral type.
3307 void C2_MacroAssembler::extract_v(Register dst, VectorRegister src,
3308 BasicType bt, int idx, VectorRegister vtmp) {
3309 assert(is_integral_type(bt), "unsupported element type");
3310 assert(idx >= 0, "idx cannot be negative");
3311 // Only need the first element after vector slidedown
3312 vsetvli_helper(bt, 1);
3313 if (idx == 0) {
3314 vmv_x_s(dst, src);
3315 } else {
3316 slidedown_v(vtmp, src, idx);
3317 vmv_x_s(dst, vtmp);
3318 }
3319 }
3320
3321 // Extract a scalar element from an vector at position 'idx'.
3322 // The input elements in src are expected to be of floating point type.
3323 void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src,
3324 BasicType bt, int idx, VectorRegister vtmp) {
3325 assert(is_floating_point_type(bt), "unsupported element type");
3326 assert(idx >= 0, "idx cannot be negative");
3327 // Only need the first element after vector slidedown
3328 vsetvli_helper(bt, 1);
3329 if (idx == 0) {
3330 vfmv_f_s(dst, src);
3331 } else {
3332 slidedown_v(vtmp, src, idx);
3333 vfmv_f_s(dst, vtmp);
3334 }
3335 }
3336
3337 // Move elements down a vector register group.
3338 // Offset is the start index (offset) for the source.
3339 void C2_MacroAssembler::slidedown_v(VectorRegister dst, VectorRegister src,
3340 uint32_t offset, Register tmp) {
3341 if (is_uimm5(offset)) {
3342 vslidedown_vi(dst, src, offset);
3343 } else {
3344 mv(tmp, offset);
3345 vslidedown_vx(dst, src, tmp);
3346 }
3347 }