/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

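// Fast-path monitor enter. A rough C-like sketch of the LM_LEGACY path below
// (illustrative only; t1 serves as the result flag, zero meaning success):
//
//   mark = obj->mark;
//   if (mark & monitor_value) goto inflated;              // already has a monitor
//   box->displaced_header = mark | unlocked_value;
//   if (CAS(&obj->mark, mark | unlocked_value, box))      // try to stack-lock
//     return 0;
//   if (((mark - sp) & (~(page_size - 1) | lock_mask)) == 0) {
//     box->displaced_header = 0;                          // recursive stack-lock
//     return 0;
//   }
//   return 1;                                             // non-zero -> slow path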
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg,
                                  Register tmp1Reg, Register tmp2Reg, Register tmp3Reg) {
  // Use the flag register (t1) to indicate the fast_lock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  assert_different_registers(oop, box, tmp, disp_hdr, flag, tmp3Reg, t0);

  // Load markWord from object into displaced_header.
  ld(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(flag, oop);
    lwu(flag, Address(flag, Klass::access_flags_offset()));
    test_bit(flag, flag, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
    bnez(flag, cont, true /* is_far */);
  }

  // Check for existing monitor
  test_bit(t0, disp_hdr, exact_log2(markWord::monitor_value));
  bnez(t0, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path
    j(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    ori(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(/*memory address*/oop, /*expected value*/tmp, /*new value*/box, Assembler::int64, Assembler::aq,
            Assembler::rl, /*result*/disp_hdr);
    mv(flag, zr);
    beq(disp_hdr, tmp, cont); // prepare zero flag and goto cont if we won the cas

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.
    // Otherwise we did not see an unlocked object, so try the fast recursive case.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    sub(disp_hdr, disp_hdr, sp);
    mv(tmp, (intptr_t) (~(os::vm_page_size()-1) | (uintptr_t)markWord::lock_mask_in_place));
    // If (mark & lock_mask) == 0 and mark - sp < page_size, we are stack-locking and goto cont,
    // hence we can store 0 as the displaced header in the box, which indicates that it is a
    // recursive lock.
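    // For example, with a 4 KiB page (illustrative; assuming os::vm_page_size() == 4096),
    // tmp is 0xffff_ffff_ffff_f003: a mark that stack-locks a frame of our own
    // thread differs from sp by less than a page and has the low lock bits
    // clear, so the AND below yields zero.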
    andr(tmp/*==0?*/, disp_hdr, tmp);
    sd(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    mv(flag, tmp); // we can use the value of tmp as the result here
    j(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    Label slow;
    lightweight_lock(oop, disp_hdr, tmp, tmp3Reg, slow);

    // Indicate success on completion.
    mv(flag, zr);
    j(count);
    bind(slow);
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow-path
    j(no_count);
  }

  // Handle existing monitor.
  bind(object_has_monitor);
  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value));
  cmpxchg(/*memory address*/tmp, /*expected value*/zr, /*new value*/xthread, Assembler::int64, Assembler::aq,
          Assembler::rl, /*result*/flag); // cas succeeds if flag == zr(expected)

  if (LockingMode != LM_LIGHTWEIGHT) {
    // Store a non-null value into the box to avoid looking like a re-entrant
    // lock. The fast-path monitor unlock code checks for
    // markWord::monitor_value so use markWord::unused_mark which has the
    // relevant bit set, and also matches ObjectSynchronizer::slow_enter.
    mv(tmp, (address)markWord::unused_mark().value());
    sd(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  }

  beqz(flag, cont); // CAS success means locking succeeded

  bne(flag, xthread, cont); // If the owner is not the current thread, this is not a recursive lock; flag stays non-zero -> failure

  // Recursive lock case
  mv(flag, zr);
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1, t0, tmp);

  bind(cont);
  // zero flag indicates success
  // non-zero flag indicates failure
  bnez(flag, no_count);

  bind(count);
  increment(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp);

  bind(no_count);
}

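// Fast-path monitor exit, mirroring fast_lock above. For LM_LEGACY a zero
// displaced header means a recursive stack-lock and there is nothing to do;
// otherwise the object mark is CASed from the box address back to the
// displaced header. An inflated monitor is released here only when it is not
// recursively held and both EntryList and cxq are empty, in which case the
// owner field is cleared with a release store.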
void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg,
                                    Register tmp1Reg, Register tmp2Reg) {
  // Use the flag register (t1) to indicate the fast_unlock result: zero for success; non-zero for failure.
  Register flag = t1;
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmp1Reg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert_different_registers(oop, box, tmp, disp_hdr, flag, t0);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ld(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    mv(flag, disp_hdr);
    beqz(disp_hdr, cont);
  }

  // Handle existing monitor.
  ld(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  test_bit(t0, tmp, exact_log2(markWord::monitor_value));
  bnez(t0, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path
    j(cont);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a light-weight (stack) lock: this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(/*memory address*/oop, /*expected value*/box, /*new value*/disp_hdr, Assembler::int64, Assembler::relaxed,
            Assembler::rl, /*result*/tmp);
    xorr(flag, box, tmp); // box == tmp if cas succeeds
    j(cont);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    Label slow;
    lightweight_unlock(oop, tmp, box, disp_hdr, slow);

    // Indicate success on completion.
    mv(flag, zr);
    j(count);
    bind(slow);
    mv(flag, 1); // Set non-zero flag to indicate 'failure' -> take slow path
    j(no_count);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is anonymous, we need to fix it -- in an outline stub.
    Register tmp2 = disp_hdr;
    ld(tmp2, Address(tmp, ObjectMonitor::owner_offset()));
    test_bit(t0, tmp2, exact_log2(ObjectMonitor::ANONYMOUS_OWNER));
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmp, tmp2);
    Compile::current()->output()->add_stub(stub);
    bnez(t0, stub->entry(), /* is_far */ true);
    bind(stub->continuation());
  }

  ld(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  beqz(disp_hdr, notRecursive); // Will be 0 if not recursive.

  // Recursive lock
  addi(disp_hdr, disp_hdr, -1);
  sd(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  mv(flag, zr);
  j(cont);

  bind(notRecursive);
  ld(flag, Address(tmp, ObjectMonitor::EntryList_offset()));
  ld(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(flag, flag, disp_hdr); // Will be 0 if both are 0.
  bnez(flag, cont);
  // need a release store here
  la(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  membar(MacroAssembler::LoadStore | MacroAssembler::StoreStore);
  sd(zr, Address(tmp)); // set unowned

  bind(cont);
  // zero flag indicates success
  // non-zero flag indicates failure
  bnez(flag, no_count);

  bind(count);
  decrement(Address(xthread, JavaThread::held_monitor_count_offset()), 1, t0, tmp);

  bind(no_count);
}

// short string
// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char_short(Register str1, Register cnt1,
                                                  Register ch, Register result,
                                                  bool isL)
{
  Register ch1 = t0;
  Register index = t1;

  BLOCK_COMMENT("string_indexof_char_short {");

  Label LOOP, LOOP1, LOOP4, LOOP8;
  Label MATCH, MATCH1, MATCH2, MATCH3,
        MATCH4, MATCH5, MATCH6, MATCH7, NOMATCH;

  mv(result, -1);
  mv(index, zr);

  bind(LOOP);
  addi(t0, index, 8);
  ble(t0, cnt1, LOOP8);
  addi(t0, index, 4);
  ble(t0, cnt1, LOOP4);
  j(LOOP1);

  bind(LOOP8);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  isL ? lbu(ch1, Address(str1, 4)) : lhu(ch1, Address(str1, 8));
  beq(ch, ch1, MATCH4);
  isL ? lbu(ch1, Address(str1, 5)) : lhu(ch1, Address(str1, 10));
  beq(ch, ch1, MATCH5);
  isL ? lbu(ch1, Address(str1, 6)) : lhu(ch1, Address(str1, 12));
  beq(ch, ch1, MATCH6);
  isL ? lbu(ch1, Address(str1, 7)) : lhu(ch1, Address(str1, 14));
  beq(ch, ch1, MATCH7);
  addi(index, index, 8);
  addi(str1, str1, isL ? 8 : 16);
  blt(index, cnt1, LOOP);
  j(NOMATCH);

  bind(LOOP4);
  isL ? lbu(ch1, Address(str1, 0)) : lhu(ch1, Address(str1, 0));
  beq(ch, ch1, MATCH);
  isL ? lbu(ch1, Address(str1, 1)) : lhu(ch1, Address(str1, 2));
  beq(ch, ch1, MATCH1);
  isL ? lbu(ch1, Address(str1, 2)) : lhu(ch1, Address(str1, 4));
  beq(ch, ch1, MATCH2);
  isL ? lbu(ch1, Address(str1, 3)) : lhu(ch1, Address(str1, 6));
  beq(ch, ch1, MATCH3);
  addi(index, index, 4);
  addi(str1, str1, isL ? 4 : 8);
  bge(index, cnt1, NOMATCH);

  bind(LOOP1);
  isL ? lbu(ch1, Address(str1)) : lhu(ch1, Address(str1));
  beq(ch, ch1, MATCH);
  addi(index, index, 1);
  addi(str1, str1, isL ? 1 : 2);
  blt(index, cnt1, LOOP1);
  j(NOMATCH);

  bind(MATCH1);
  addi(index, index, 1);
  j(MATCH);

  bind(MATCH2);
  addi(index, index, 2);
  j(MATCH);

  bind(MATCH3);
  addi(index, index, 3);
  j(MATCH);

  bind(MATCH4);
  addi(index, index, 4);
  j(MATCH);

  bind(MATCH5);
  addi(index, index, 5);
  j(MATCH);

  bind(MATCH6);
  addi(index, index, 6);
  j(MATCH);

  bind(MATCH7);
  addi(index, index, 7);

  bind(MATCH);
  mv(result, index);
  bind(NOMATCH);
  BLOCK_COMMENT("} string_indexof_char_short");
}

// StringUTF16.indexOfChar
// StringLatin1.indexOfChar
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2,
                                            Register tmp3, Register tmp4,
                                            bool isL)
{
  Label CH1_LOOP, HIT, NOMATCH, DONE, DO_LONG;
  Register ch1 = t0;
  Register orig_cnt = t1;
  Register mask1 = tmp3;
  Register mask2 = tmp2;
  Register match_mask = tmp1;
  Register trailing_char = tmp4;
  Register unaligned_elems = tmp4;

  BLOCK_COMMENT("string_indexof_char {");
  beqz(cnt1, NOMATCH);

  addi(t0, cnt1, isL ? -32 : -16);
  bgtz(t0, DO_LONG);
  string_indexof_char_short(str1, cnt1, ch, result, isL);
  j(DONE);

  bind(DO_LONG);
  mv(orig_cnt, cnt1);
  if (AvoidUnalignedAccesses) {
    Label ALIGNED;
    andi(unaligned_elems, str1, 0x7);
    beqz(unaligned_elems, ALIGNED);
    sub(unaligned_elems, unaligned_elems, 8);
    neg(unaligned_elems, unaligned_elems);
    if (!isL) {
      srli(unaligned_elems, unaligned_elems, 1);
    }
    // do unaligned part per element
    string_indexof_char_short(str1, unaligned_elems, ch, result, isL);
    bgez(result, DONE);
    mv(orig_cnt, cnt1);
    sub(cnt1, cnt1, unaligned_elems);
    bind(ALIGNED);
  }

  // duplicate ch
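  // Broadcast the character across all 8 bytes of a 64-bit register by
  // shift-or doubling; e.g. for Latin1 (illustrative value):
  //   ch  = 0x00000000000000ab
  //   ch |= ch << 8    -> 0x000000000000abab
  //   ch |= ch << 16   -> 0x00000000abababab
  //   ch |= ch << 32   -> 0xabababababababab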
  if (isL) {
    slli(ch1, ch, 8);
    orr(ch, ch1, ch);
  }
  slli(ch1, ch, 16);
  orr(ch, ch1, ch);
  slli(ch1, ch, 32);
  orr(ch, ch1, ch);

  if (!isL) {
    slli(cnt1, cnt1, 1);
  }

  uint64_t mask0101 = UCONST64(0x0101010101010101);
  uint64_t mask0001 = UCONST64(0x0001000100010001);
  mv(mask1, isL ? mask0101 : mask0001);
  uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
  uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
  mv(mask2, isL ? mask7f7f : mask7fff);
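  // compute_match_mask below applies the classic SWAR zero-lane trick
  // (roughly (x - mask1) & ~(x | mask2) for x = chunk ^ broadcast_ch), which
  // sets the top bit of every byte (halfword for UTF-16) lane in which the
  // loaded chunk equals the searched character.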

  bind(CH1_LOOP);
  ld(ch1, Address(str1));
  addi(str1, str1, 8);
  addi(cnt1, cnt1, -8);
  compute_match_mask(ch1, ch, match_mask, mask1, mask2);
  bnez(match_mask, HIT);
  bgtz(cnt1, CH1_LOOP);
  j(NOMATCH);

  bind(HIT);
  ctzc_bit(trailing_char, match_mask, isL, ch1, result); // bit index of the first match
  srli(trailing_char, trailing_char, 3); // convert to a byte index within the word
  addi(cnt1, cnt1, 8);
  ble(cnt1, trailing_char, NOMATCH); // the match lies beyond the string end (tail over-read)
  // match case
  if (!isL) {
    srli(cnt1, cnt1, 1); // convert byte counts back to character counts
    srli(trailing_char, trailing_char, 1);
  }

  sub(result, orig_cnt, cnt1);
  add(result, result, trailing_char);
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof_char");
}

typedef void (MacroAssembler::* load_chr_insn)(Register rd, const Address &adr, Register temp);

// Search for needle in haystack and return index or -1
// x10: result
// x11: haystack
// x12: haystack_len
// x13: needle
// x14: needle_len
void C2_MacroAssembler::string_indexof(Register haystack, Register needle,
                                       Register haystack_len, Register needle_len,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       Register result, int ae)
{
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Label LINEARSEARCH, LINEARSTUB, DONE, NOMATCH;

  Register ch1 = t0;
  Register ch2 = t1;
  Register nlen_tmp = tmp1; // needle len tmp
  Register hlen_tmp = tmp2; // haystack len tmp
  Register result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;
  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                    (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_indexof {");

  // Note, inline_string_indexOf() generates checks:
  // if (pattern.count > src.count) return -1;
  // if (pattern.count == 0) return 0;

  // We have two strings, a source string in haystack, haystack_len and a pattern string
  // in needle, needle_len. Find the first occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  // needle_len >= 8 && needle_len < 256 && needle_len < haystack_len/4, use BMH algorithm.
  sub(result_tmp, haystack_len, needle_len);
  // needle_len < 8, use linear scan
  sub(t0, needle_len, 8);
  bltz(t0, LINEARSEARCH);
  // needle_len >= 256, use linear scan
  sub(t0, needle_len, 256);
  bgez(t0, LINEARSTUB);
  // needle_len >= haystack_len/4, use linear scan
  srli(t0, haystack_len, 2);
  bge(needle_len, t0, LINEARSTUB);

  // Boyer-Moore-Horspool introduction:
  // The Boyer-Moore algorithm is based on the description here:
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // #define ASIZE 256
  //
  // int bm(unsigned char *pattern, int m, unsigned char *src, int n) {
  //   int i, j;
  //   unsigned c;
  //   unsigned char bc[ASIZE];
  //
  //   /* Preprocessing */
  //   for (i = 0; i < ASIZE; ++i)
  //     bc[i] = m;
  //   for (i = 0; i < m - 1; ) {
  //     c = pattern[i];
  //     ++i;
  //     // c < 256 for Latin1 string, so, no need for branch
  // #ifdef PATTERN_STRING_IS_LATIN1
  //     bc[c] = m - i;
  // #else
  //     if (c < ASIZE) bc[c] = m - i;
  // #endif
  //   }
  //
  //   /* Searching */
  //   j = 0;
  //   while (j <= n - m) {
  //     c = src[j + m - 1];
  //     if (pattern[m-1] == c) {
  //       int k;
  //       for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  //       if (k < 0) return j;
  //     }
  //     // c < 256 for Latin1 string, so, no need for branch
  // #ifdef SOURCE_STRING_IS_LATIN1_AND_PATTERN_STRING_IS_LATIN1
  //     // LL case: (c < 256) always true. Remove branch
  //     j += bc[c];
  // #endif
  // #ifdef SOURCE_STRING_IS_UTF_AND_PATTERN_STRING_IS_UTF
  //     // UU case: need if (c < ASIZE) check. Skip 1 character if not.
  //     if (c < ASIZE)
  //       j += bc[c];
  //     else
  //       j += 1;
  // #endif
  // #ifdef SOURCE_IS_UTF_AND_PATTERN_IS_LATIN1
  //     // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
  //     if (c < ASIZE)
  //       j += bc[c];
  //     else
  //       j += m;
  // #endif
  //   }
  //   return -1;
  // }

  // temp registers: t0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, result
  Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;

  Register haystack_end = haystack_len;
  Register skipch = tmp2;

  // The pattern length is >= 8, so we can read at least one full register for
  // the cases that need no encoding conversion (8 chars for LL, 4 for UU) and
  // half a register for the UL case. The last character is re-read in the
  // inner pre-loop code so that a single outer pre-loop load suffices.
  const int firstStep = isLL ? 7 : 3;

  const int ASIZE = 256;
  const int STORE_BYTES = 8; // 8 bytes stored per instruction (sd)

  sub(sp, sp, ASIZE);

  // init BC offset table with default value: needle_len
  slli(t0, needle_len, 8);
  orr(t0, t0, needle_len); // [63...16][needle_len][needle_len]
  slli(tmp1, t0, 16);
  orr(t0, tmp1, t0); // [63...32][needle_len][needle_len][needle_len][needle_len]
  slli(tmp1, t0, 32);
  orr(tmp5, tmp1, t0); // tmp5: 8 elements [needle_len]
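  // e.g. needle_len == 9 (needle_len < 256 is guaranteed by the dispatch
  // above) gives tmp5 = 0x0909090909090909, so each sd below seeds eight
  // byte-sized table entries at once.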

  mv(ch1, sp); // ch1 is t0
  mv(tmp6, ASIZE / STORE_BYTES); // loop iterations

  bind(BM_INIT_LOOP);
  // for (i = 0; i < ASIZE; ++i)
  //   bc[i] = m;
  for (int i = 0; i < 4; i++) {
    sd(tmp5, Address(ch1, i * wordSize));
  }
  add(ch1, ch1, 32);
  sub(tmp6, tmp6, 4);
  bgtz(tmp6, BM_INIT_LOOP);

  sub(nlen_tmp, needle_len, 1); // m - 1, index of the last element in pattern
  Register orig_haystack = tmp5;
  mv(orig_haystack, haystack);
  // result_tmp = tmp4
  shadd(haystack_end, result_tmp, haystack, haystack_end, haystack_chr_shift);
  sub(ch2, needle_len, 1); // bc offset init value, ch2 is t1
  mv(tmp3, needle);

  // for (i = 0; i < m - 1; ) {
  //   c = pattern[i];
  //   ++i;
  //   // c < 256 for Latin1 string, so, no need for branch
  // #ifdef PATTERN_STRING_IS_LATIN1
  //   bc[c] = m - i;
  // #else
  //   if (c < ASIZE) bc[c] = m - i;
  // #endif
  // }
  bind(BCLOOP);
  (this->*needle_load_1chr)(ch1, Address(tmp3), noreg);
  add(tmp3, tmp3, needle_chr_size);
  if (!needle_isL) {
    // ae == StrIntrinsicNode::UU
    mv(tmp6, ASIZE);
    bgeu(ch1, tmp6, BCSKIP);
  }
  add(tmp4, sp, ch1);
  sb(ch2, Address(tmp4)); // store skip offset to BC offset table

  bind(BCSKIP);
  sub(ch2, ch2, 1); // for next pattern element, skip distance -1
  bgtz(ch2, BCLOOP);

  // tmp6: pattern end, address after needle
  shadd(tmp6, needle_len, needle, tmp6, needle_chr_shift);
  if (needle_isL == haystack_isL) {
    // load last 8 bytes (8 LL / 4 UU symbols)
    ld(tmp6, Address(tmp6, -wordSize));
  } else {
    // UL: from UTF-16 (source) search Latin1 (pattern)
    lwu(tmp6, Address(tmp6, -wordSize / 2)); // load last 4 bytes (4 symbols)
    // convert Latin1 to UTF, e.g. 0x0000abcd -> 0x0a0b0c0d
    // We'll have to wait until load completed, but it's still faster than per-character loads + checks
    srli(tmp3, tmp6, BitsPerByte * (wordSize / 2 - needle_chr_size)); // pattern[m-1], e.g. 0x0000000a
    slli(ch2, tmp6, XLEN - 24);
    srli(ch2, ch2, XLEN - 8); // pattern[m-2], 0x0000000b
    slli(ch1, tmp6, XLEN - 16);
    srli(ch1, ch1, XLEN - 8); // pattern[m-3], 0x0000000c
    andi(tmp6, tmp6, 0xff); // pattern[m-4], 0x0000000d
    slli(ch2, ch2, 16);
    orr(ch2, ch2, ch1); // 0x00000b0c
    slli(result, tmp3, 48); // use result as temp register
    orr(tmp6, tmp6, result); // 0x0a00000d
    slli(result, ch2, 16);
    orr(tmp6, tmp6, result); // UTF-16:0x0a0b0c0d
  }

  // i = m - 1;
  // skipch = j + i;
  // if (skipch == pattern[m - 1])
  //   for (k = m - 2; k >= 0 && pattern[k] == src[k + j]; --k);
  // else
  //   move j with bad char offset table
  bind(BMLOOPSTR2);
  // compare pattern to source string backward
  shadd(result, nlen_tmp, haystack, result, haystack_chr_shift);
  (this->*haystack_load_1chr)(skipch, Address(result), noreg);
  sub(nlen_tmp, nlen_tmp, firstStep); // nlen_tmp is positive here, because needle_len >= 8
  if (needle_isL == haystack_isL) {
    // re-init tmp3. It's for free because it's executed in parallel with
    // load above. Alternative is to initialize it before loop, but it'll
    // affect performance on in-order systems with 2 or more ld/st pipelines
    srli(tmp3, tmp6, BitsPerByte * (wordSize - needle_chr_size)); // UU/LL: pattern[m-1]
  }
  if (!isLL) { // UU/UL case
    slli(ch2, nlen_tmp, 1); // offsets in bytes
  }
  bne(tmp3, skipch, BMSKIP); // if not equal, skipch is bad char
  add(result, haystack, isLL ? nlen_tmp : ch2);
  // load 8 bytes from source string
  // if isLL is false then read granularity can be 2
  load_long_misaligned(ch2, Address(result), ch1, isLL ? 1 : 2); // can use ch1 as temp register here as it will be trashed by next mv anyway
  mv(ch1, tmp6);
  if (isLL) {
    j(BMLOOPSTR1_AFTER_LOAD);
  } else {
    sub(nlen_tmp, nlen_tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
    j(BMLOOPSTR1_CMP);
  }

  bind(BMLOOPSTR1);
  shadd(ch1, nlen_tmp, needle, ch1, needle_chr_shift);
  (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
  shadd(ch2, nlen_tmp, haystack, ch2, haystack_chr_shift);
  (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);

  bind(BMLOOPSTR1_AFTER_LOAD);
  sub(nlen_tmp, nlen_tmp, 1);
  bltz(nlen_tmp, BMLOOPSTR1_LASTCMP);

  bind(BMLOOPSTR1_CMP);
  beq(ch1, ch2, BMLOOPSTR1);

  bind(BMSKIP);
  if (!isLL) {
    // if we've met UTF symbol while searching Latin1 pattern, then we can
    // skip needle_len symbols
    if (needle_isL != haystack_isL) {
      mv(result_tmp, needle_len);
    } else {
      mv(result_tmp, 1);
    }
    mv(t0, ASIZE);
    bgeu(skipch, t0, BMADV);
  }
  add(result_tmp, sp, skipch);
  lbu(result_tmp, Address(result_tmp)); // load skip offset

  bind(BMADV);
  sub(nlen_tmp, needle_len, 1);
  // move haystack after bad char skip offset
  shadd(haystack, result_tmp, haystack, result, haystack_chr_shift);
  ble(haystack, haystack_end, BMLOOPSTR2);
  add(sp, sp, ASIZE);
  j(NOMATCH);

  bind(BMLOOPSTR1_LASTCMP);
  bne(ch1, ch2, BMSKIP);

  bind(BMMATCH);
  sub(result, haystack, orig_haystack);
  if (!haystack_isL) {
    srli(result, result, 1);
  }
  add(sp, sp, ASIZE);
  j(DONE);

  bind(LINEARSTUB);
  sub(t0, needle_len, 16); // small patterns should still be handled by the simple algorithm
  bltz(t0, LINEARSEARCH);
  mv(result, zr);
  RuntimeAddress stub = nullptr;
  if (isLL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ll());
    assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
  } else if (needle_isL) {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_ul());
    assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
  } else {
    stub = RuntimeAddress(StubRoutines::riscv::string_indexof_linear_uu());
    assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
  }
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(LINEARSEARCH, DONE, NOMATCH));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(LINEARSEARCH);
  string_indexof_linearscan(haystack, needle, haystack_len, needle_len, tmp1, tmp2, tmp3, tmp4, -1, result, ae);

  bind(DONE);
  BLOCK_COMMENT("} string_indexof");
}

// string_indexof_linearscan
// result: x10
// src: x11
// src_count: x12
// pattern: x13
// pattern_count: x14 or 1/2/3/4
void C2_MacroAssembler::string_indexof_linearscan(Register haystack, Register needle,
                                                  Register haystack_len, Register needle_len,
                                                  Register tmp1, Register tmp2,
                                                  Register tmp3, Register tmp4,
                                                  int needle_con_cnt, Register result, int ae)
{
  // Note:
  // needle_con_cnt > 0 means the needle_len register is invalid and the needle length is constant;
  // for UU/LL: needle_con_cnt is in [1, 4]; for UL: needle_con_cnt == 1
  assert(needle_con_cnt <= 4, "Invalid needle constant count");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  Register ch1 = t0;
  Register ch2 = t1;
  Register hlen_neg = haystack_len, nlen_neg = needle_len;
  Register nlen_tmp = tmp1, hlen_tmp = tmp2, result_tmp = tmp4;

  bool isLL = ae == StrIntrinsicNode::LL;

  bool needle_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool haystack_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int needle_chr_shift = needle_isL ? 0 : 1;
  int haystack_chr_shift = haystack_isL ? 0 : 1;
  int needle_chr_size = needle_isL ? 1 : 2;
  int haystack_chr_size = haystack_isL ? 1 : 2;

  load_chr_insn needle_load_1chr = needle_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn haystack_load_1chr = haystack_isL ? (load_chr_insn)&MacroAssembler::lbu :
                                                    (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn load_2chr = isLL ? (load_chr_insn)&MacroAssembler::lhu : (load_chr_insn)&MacroAssembler::lwu;
  load_chr_insn load_4chr = isLL ? (load_chr_insn)&MacroAssembler::lwu : (load_chr_insn)&MacroAssembler::ld;

  Label DO1, DO2, DO3, MATCH, NOMATCH, DONE;

  Register first = tmp3;

  if (needle_con_cnt == -1) {
    Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

    sub(t0, needle_len, needle_isL == haystack_isL ? 4 : 2);
    bltz(t0, DOSHORT);

    (this->*needle_load_1chr)(first, Address(needle), noreg);
    slli(t0, needle_len, needle_chr_shift);
    add(needle, needle, t0);
    neg(nlen_neg, t0);
    slli(t0, result_tmp, haystack_chr_shift);
    add(haystack, haystack, t0);
    neg(hlen_neg, t0);

    bind(FIRST_LOOP);
    add(t0, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(t0), noreg);
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    add(nlen_tmp, nlen_neg, needle_chr_size);
    add(hlen_tmp, hlen_neg, haystack_chr_size);
    bgez(nlen_tmp, MATCH);

    bind(STR1_NEXT);
    add(ch1, needle, nlen_tmp);
    (this->*needle_load_1chr)(ch1, Address(ch1), noreg);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    add(nlen_tmp, nlen_tmp, needle_chr_size);
    add(hlen_tmp, hlen_tmp, haystack_chr_size);
    bltz(nlen_tmp, STR1_NEXT);
    j(MATCH);

    bind(DOSHORT);
    if (needle_isL == haystack_isL) {
      sub(t0, needle_len, 2);
      bltz(t0, DO1);
      bgtz(t0, DO3);
    }
  }

  if (needle_con_cnt == 4) {
    Label CH1_LOOP;
    (this->*load_4chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 4);
    slli(tmp3, result_tmp, haystack_chr_shift); // result as tmp
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // Preload the first value; afterwards the loop reads one character per
      // iteration instead of four, shifting the previous ch2 right by one
      // character's worth of bits.
      add(tmp3, haystack, hlen_neg);
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
      if (isLL) {
        // need to erase the most significant byte of the 32-bit value in ch2
        slli(ch2, ch2, 40);
        srli(ch2, ch2, 32);
      } else {
        slli(ch2, ch2, 16); // the 2 most significant bytes are erased by this operation
      }
    }

    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 3 : 6), noreg);
      slli(tmp3, tmp3, isLL ? 24 : 48);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_4chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 2) {
    Label CH1_LOOP;
    BLOCK_COMMENT("string_indexof DO2 {");
    bind(DO2);
    (this->*load_2chr)(ch1, Address(needle), noreg);
    if (needle_con_cnt == 2) {
      sub(result_tmp, haystack_len, 2);
    }
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);
    if (AvoidUnalignedAccesses) {
      // Preload the first value; afterwards the loop reads one character per
      // iteration instead of two, shifting the previous ch2 right by one
      // character's worth of bits.
      add(tmp3, haystack, hlen_neg);
      (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
      slli(ch2, ch2, isLL ? 8 : 16);
    }
    bind(CH1_LOOP);
    add(tmp3, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      srli(ch2, ch2, isLL ? 8 : 16);
      (this->*haystack_load_1chr)(tmp3, Address(tmp3, isLL ? 1 : 2), noreg);
      slli(tmp3, tmp3, isLL ? 8 : 16);
      add(ch2, ch2, tmp3);
    } else {
      (this->*load_2chr)(ch2, Address(tmp3), noreg);
    }
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, CH1_LOOP);
    j(NOMATCH);
    BLOCK_COMMENT("} string_indexof DO2");
  }

  if ((needle_con_cnt == -1 && needle_isL == haystack_isL) || needle_con_cnt == 3) {
    Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
    BLOCK_COMMENT("string_indexof DO3 {");

    bind(DO3);
    (this->*load_2chr)(first, Address(needle), noreg);
    (this->*needle_load_1chr)(ch1, Address(needle, 2 * needle_chr_size), noreg);
    if (needle_con_cnt == 3) {
      sub(result_tmp, haystack_len, 3);
    }
    slli(hlen_tmp, result_tmp, haystack_chr_shift);
    add(haystack, haystack, hlen_tmp);
    neg(hlen_neg, hlen_tmp);

    bind(FIRST_LOOP);
    add(ch2, haystack, hlen_neg);
    if (AvoidUnalignedAccesses) {
      // We need a temp register; hlen_tmp, a synonym for tmp2, is safe to use here.
      (this->*haystack_load_1chr)(tmp2, Address(ch2, isLL ? 1 : 2), noreg);
      (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
      slli(tmp2, tmp2, isLL ? 8 : 16);
      add(ch2, ch2, tmp2);
    } else {
      (this->*load_2chr)(ch2, Address(ch2), noreg);
    }
    beq(first, ch2, STR1_LOOP);

    bind(STR2_NEXT);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, FIRST_LOOP);
    j(NOMATCH);

    bind(STR1_LOOP);
    add(hlen_tmp, hlen_neg, 2 * haystack_chr_size);
    add(ch2, haystack, hlen_tmp);
    (this->*haystack_load_1chr)(ch2, Address(ch2), noreg);
    bne(ch1, ch2, STR2_NEXT);
    j(MATCH);
    BLOCK_COMMENT("} string_indexof DO3");
  }

  if (needle_con_cnt == -1 || needle_con_cnt == 1) {
    Label DO1_LOOP;

    BLOCK_COMMENT("string_indexof DO1 {");
    bind(DO1);
    (this->*needle_load_1chr)(ch1, Address(needle), noreg);
    sub(result_tmp, haystack_len, 1);
    slli(tmp3, result_tmp, haystack_chr_shift);
    add(haystack, haystack, tmp3);
    neg(hlen_neg, tmp3);

    bind(DO1_LOOP);
    add(tmp3, haystack, hlen_neg);
    (this->*haystack_load_1chr)(ch2, Address(tmp3), noreg);
    beq(ch1, ch2, MATCH);
    add(hlen_neg, hlen_neg, haystack_chr_size);
    blez(hlen_neg, DO1_LOOP);
    BLOCK_COMMENT("} string_indexof DO1");
  }

  bind(NOMATCH);
  mv(result, -1);
  j(DONE);

  bind(MATCH);
  srai(t0, hlen_neg, haystack_chr_shift);
  add(result, result_tmp, t0);

  bind(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
                                       Register tmp3, int ae)
{
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
        DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
        SHORT_LOOP_START, TAIL_CHECK, L;

  const int STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  // for L strings, 1 byte for 1 character
  // for U strings, 2 bytes for 1 character
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize / 2; // characters per 64-bit word

  load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;
  load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) {
    sraiw(cnt1, cnt1, 1);
  }
  if (!str2_isL) {
    sraiw(cnt2, cnt2, 1);
  }

  // Compute the minimum of the string lengths and save the difference in result.
  sub(result, cnt1, cnt2);
  bgt(cnt1, cnt2, L);
  mv(cnt2, cnt1);
  bind(L);

  // A very short string
  mv(t0, minCharsInWord);
  ble(cnt2, t0, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      // check if str1 and str2 are the same pointer
      beq(str1, str2, DONE);
      // load 8 bytes once to compare
      ld(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      sub(cnt2, cnt2, minCharsInWord);
      beqz(cnt2, TAIL_CHECK);
      // convert cnt2 from characters to bytes
      if (!str1_isL) {
        slli(cnt2, cnt2, 1);
      }
      add(str2, str2, cnt2);
      add(str1, str1, cnt2);
      sub(cnt2, zr, cnt2);
    } else if (isLU) { // LU case
      lwu(tmp1, Address(str1));
      ld(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      add(str1, str1, cnt2);
      sub(cnt1, zr, cnt2);
      slli(cnt2, cnt2, 1);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 4);
    } else { // UL case
      ld(tmp1, Address(str1));
      lwu(tmp2, Address(str2));
      mv(t0, STUB_THRESHOLD);
      bge(cnt2, t0, STUB);
      addi(cnt2, cnt2, -4);
      slli(t0, cnt2, 1);
      sub(cnt1, zr, t0);
      add(str1, str1, t0);
      add(str2, str2, cnt2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      sub(cnt2, zr, cnt2);
      addi(cnt1, cnt1, 8);
    }
    addi(cnt2, cnt2, isUL ? 4 : 8);
    bne(tmp1, tmp2, DIFFERENCE);
    bgez(cnt2, TAIL);

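    // Note the indexing scheme set up above: str1/str2 were advanced to their
    // final words, and cnt1/cnt2 hold negative byte offsets counted up toward
    // zero, so the loop below needs no separate end pointers.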
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) { // LL or UU
      add(t0, str1, cnt2);
      ld(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt2, cnt2, 8);
    } else if (isLU) { // LU case
      add(t0, str1, cnt1);
      lwu(tmp1, Address(t0));
      add(t0, str2, cnt2);
      ld(tmp2, Address(t0));
      addi(cnt1, cnt1, 4);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
      addi(cnt2, cnt2, 8);
    } else { // UL case
      add(t0, str2, cnt2);
      lwu(tmp2, Address(t0));
      add(t0, str1, cnt1);
      ld(tmp1, Address(t0));
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
      addi(cnt1, cnt1, 8);
      addi(cnt2, cnt2, 4);
    }
    bne(tmp1, tmp2, DIFFERENCE);
    bltz(cnt2, NEXT_WORD);
    bind(TAIL);
    if (str1_isL == str2_isL) { // LL or UU
      load_long_misaligned(tmp1, Address(str1), tmp3, isLL ? 1 : 2);
      load_long_misaligned(tmp2, Address(str2), tmp3, isLL ? 1 : 2);
    } else if (isLU) { // LU case
      load_int_misaligned(tmp1, Address(str1), tmp3, false);
      load_long_misaligned(tmp2, Address(str2), tmp3, 2);
      inflate_lo32(tmp3, tmp1);
      mv(tmp1, tmp3);
    } else { // UL case
      load_int_misaligned(tmp2, Address(str2), tmp3, false);
      load_long_misaligned(tmp1, Address(str1), tmp3, 2);
      inflate_lo32(tmp3, tmp2);
      mv(tmp2, tmp3);
    }
    bind(TAIL_CHECK);
    beq(tmp1, tmp2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    xorr(tmp3, tmp1, tmp2);
    ctzc_bit(result, tmp3, isLL); // count zero from lsb to msb
    srl(tmp1, tmp1, result);
    srl(tmp2, tmp2, result);
    if (isLL) {
      andi(tmp1, tmp1, 0xFF);
      andi(tmp2, tmp2, 0xFF);
    } else {
      andi(tmp1, tmp1, 0xFFFF);
      andi(tmp2, tmp2, 0xFFFF);
    }
    sub(result, tmp1, tmp2);
    j(DONE);
  }

  bind(STUB);
  RuntimeAddress stub = nullptr;
  switch (ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::riscv::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
  address call = trampoline_call(stub);
  if (call == nullptr) {
    DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }
  j(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  beqz(cnt2, DONE);
  // Arrange the code so that most branches happen while loading, loading the
  // next characters while comparing the previous ones.
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  j(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(t0, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bne(tmp1, cnt1, SHORT_LOOP_TAIL);
  addi(cnt2, cnt2, -1);
  beqz(cnt2, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(str1), t0);
  addi(str1, str1, str1_chr_size);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  beq(tmp2, t0, SHORT_LOOP);
  sub(result, tmp2, t0);
  j(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  j(DONE);
  bind(SHORT_LAST2);
  beq(tmp2, t0, DONE);
  sub(result, tmp2, t0);

  j(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(str2), t0);
  addi(str2, str2, str2_chr_size);
  bind(SHORT_LAST);
  beq(tmp1, cnt1, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                      Register tmp4, Register tmp5, Register tmp6, Register result,
                                      Register cnt1, int elem_size) {
  Label DONE, SAME, NEXT_DWORD, SHORT, TAIL, TAIL2, IS_TMP5_ZR;
  Register tmp1 = t0;
  Register tmp2 = t1;
  Register cnt2 = tmp2; // cnt2 only used in array length compare
  Register elem_per_word = tmp6;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, t0, t1, tmp3, tmp4, tmp5, tmp6);
  mv(elem_per_word, wordSize / elem_size);

  BLOCK_COMMENT("arrays_equals {");

  // if (a1 == a2), return true
  beq(a1, a2, SAME);

  mv(result, false);
  beqz(a1, DONE);
  beqz(a2, DONE);
  lwu(cnt1, Address(a1, length_offset));
  lwu(cnt2, Address(a2, length_offset));
  bne(cnt2, cnt1, DONE);
  beqz(cnt1, SAME);

  slli(tmp5, cnt1, 3 + log_elem_size);
  sub(tmp5, zr, tmp5);
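  // tmp5 now holds the negated array length in bits; used below (mod 64, per
  // RISC-V sll semantics) as a left-shift amount that discards the lanes of
  // the final word lying past the end of the arrays.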
  add(a1, a1, base_offset);
  add(a2, a2, base_offset);
  ld(tmp3, Address(a1, 0));
  ld(tmp4, Address(a2, 0));
  ble(cnt1, elem_per_word, SHORT); // short or same

  // Main 16 byte comparison loop with 2 exits
  bind(NEXT_DWORD); {
    ld(tmp1, Address(a1, wordSize));
    ld(tmp2, Address(a2, wordSize));
    sub(cnt1, cnt1, 2 * wordSize / elem_size);
    blez(cnt1, TAIL);
    bne(tmp3, tmp4, DONE);
    ld(tmp3, Address(a1, 2 * wordSize));
    ld(tmp4, Address(a2, 2 * wordSize));
    add(a1, a1, 2 * wordSize);
    add(a2, a2, 2 * wordSize);
    ble(cnt1, elem_per_word, TAIL2);
  } beq(tmp1, tmp2, NEXT_DWORD);
  j(DONE);

  bind(TAIL);
  xorr(tmp4, tmp3, tmp4);
  xorr(tmp2, tmp1, tmp2);
  sll(tmp2, tmp2, tmp5);
  orr(tmp5, tmp4, tmp2);
  j(IS_TMP5_ZR);

  bind(TAIL2);
  bne(tmp1, tmp2, DONE);

  bind(SHORT);
  xorr(tmp4, tmp3, tmp4);
  sll(tmp5, tmp4, tmp5);

  bind(IS_TMP5_ZR);
  bnez(tmp5, DONE);

  bind(SAME);
  mv(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} arrays_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations. For arrays >= 8 bytes, all
// comparisons (for hw supporting unaligned access: including the final one,
// which may overlap) are performed 8 bytes at a time.
// For strings < 8 bytes (and for tails of long strings when
// AvoidUnalignedAccesses is true), we compare a
// word, then a halfword, and then a byte.

void C2_MacroAssembler::string_equals(Register a1, Register a2,
                                      Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = t0;
  Register tmp2 = t1;

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2);

  BLOCK_COMMENT("string_equals {");

  beqz(cnt1, SAME);
  mv(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  sub(cnt1, cnt1, wordSize);
  bltz(cnt1, SHORT);

  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ld(tmp1, Address(a1, 0));
    add(a1, a1, wordSize);
    ld(tmp2, Address(a2, 0));
    add(a2, a2, wordSize);
    sub(cnt1, cnt1, wordSize);
    bne(tmp1, tmp2, DONE);
  } bgez(cnt1, NEXT_WORD);

  if (!AvoidUnalignedAccesses) {
    // Last longword. In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    add(tmp1, a1, cnt1);
    ld(tmp1, Address(tmp1, 0));
    add(tmp2, a2, cnt1);
    ld(tmp2, Address(tmp2, 0));
    bne(tmp1, tmp2, DONE);
    j(SAME);
  } else {
    // cnt1 is in [-wordSize, -1] here; cnt1 + wordSize is the number of
    // remaining tail bytes (zero means the lengths divided evenly), which
    // the code below then compares a word, a halfword and a byte at a time.
    add(tmp1, cnt1, wordSize);
    beqz(tmp1, SAME);
  }

  bind(SHORT);
  Label TAIL03, TAIL01;

  // 0-7 bytes left.
  test_bit(tmp1, cnt1, 2);
  beqz(tmp1, TAIL03);
  {
    lwu(tmp1, Address(a1, 0));
    add(a1, a1, 4);
    lwu(tmp2, Address(a2, 0));
    add(a2, a2, 4);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL03);
  // 0-3 bytes left.
  test_bit(tmp1, cnt1, 1);
  beqz(tmp1, TAIL01);
  {
    lhu(tmp1, Address(a1, 0));
    add(a1, a1, 2);
    lhu(tmp2, Address(a2, 0));
    add(a2, a2, 2);
    bne(tmp1, tmp2, DONE);
  }

  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    // 0-1 bytes left.
    test_bit(tmp1, cnt1, 0);
    beqz(tmp1, SAME);
    {
      lbu(tmp1, Address(a1, 0));
      lbu(tmp2, Address(a2, 0));
      bne(tmp1, tmp2, DONE);
    }
  }

  // Arrays are equal.
  bind(SAME);
  mv(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}

typedef void (Assembler::*conditional_branch_insn)(Register op1, Register op2, Label& label, bool is_far);
typedef void (MacroAssembler::*float_conditional_branch_insn)(FloatRegister op1, FloatRegister op2, Label& label,
                                                              bool is_far, bool is_unordered);

static conditional_branch_insn conditional_branches[] =
{
  /* SHORT branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgt,
  nullptr, // BoolTest::overflow
  (conditional_branch_insn)&MacroAssembler::blt,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::ble,
  nullptr, // BoolTest::no_overflow
  (conditional_branch_insn)&MacroAssembler::bge,

  /* UNSIGNED branches */
  (conditional_branch_insn)&MacroAssembler::beq,
  (conditional_branch_insn)&MacroAssembler::bgtu,
  nullptr,
  (conditional_branch_insn)&MacroAssembler::bltu,
  (conditional_branch_insn)&MacroAssembler::bne,
  (conditional_branch_insn)&MacroAssembler::bleu,
  nullptr,
  (conditional_branch_insn)&MacroAssembler::bgeu
};

static float_conditional_branch_insn float_conditional_branches[] =
{
  /* FLOAT SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::float_beq,
  (float_conditional_branch_insn)&MacroAssembler::float_bgt,
  nullptr, // BoolTest::overflow
  (float_conditional_branch_insn)&MacroAssembler::float_blt,
  (float_conditional_branch_insn)&MacroAssembler::float_bne,
  (float_conditional_branch_insn)&MacroAssembler::float_ble,
  nullptr, // BoolTest::no_overflow
  (float_conditional_branch_insn)&MacroAssembler::float_bge,

  /* DOUBLE SHORT branches */
  (float_conditional_branch_insn)&MacroAssembler::double_beq,
  (float_conditional_branch_insn)&MacroAssembler::double_bgt,
  nullptr,
  (float_conditional_branch_insn)&MacroAssembler::double_blt,
  (float_conditional_branch_insn)&MacroAssembler::double_bne,
  (float_conditional_branch_insn)&MacroAssembler::double_ble,
  nullptr,
  (float_conditional_branch_insn)&MacroAssembler::double_bge
};
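
// Index layout shared by both tables: the low bits select the condition in
// the order listed above (eq, gt, overflow, lt, ne, le, no_overflow, ge);
// the next bit selects the second half of the table (the unsigned variants,
// resp. the double variants; see double_branch_mask in float_cmp_branch
// below).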

void C2_MacroAssembler::cmp_branch(int cmpFlag, Register op1, Register op2, Label& label, bool is_far) {
  assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(conditional_branches) / sizeof(conditional_branches[0])),
         "invalid conditional branch index");
  (this->*conditional_branches[cmpFlag])(op1, op2, label, is_far);
}

// This function should only be used by C2. C2 flips an unordered-greater test
// into unordered-less, so only ge/gt are treated as ordered here; the result
// bits are commuted in do_one_bytecode().
void C2_MacroAssembler::float_cmp_branch(int cmpFlag, FloatRegister op1, FloatRegister op2, Label& label, bool is_far) {
  assert(cmpFlag >= 0 && cmpFlag < (int)(sizeof(float_conditional_branches) / sizeof(float_conditional_branches[0])),
         "invalid float conditional branch index");
  int booltest_flag = cmpFlag & ~(C2_MacroAssembler::double_branch_mask);
  (this->*float_conditional_branches[cmpFlag])(op1, op2, label, is_far,
                                               (booltest_flag == (BoolTest::ge) || booltest_flag == (BoolTest::gt)) ? false : true);
}

void C2_MacroAssembler::enc_cmpUEqNeLeGt_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
  switch (cmpFlag) {
    case BoolTest::eq:
    case BoolTest::le:
      beqz(op1, L, is_far);
      break;
    case BoolTest::ne:
    case BoolTest::gt:
      bnez(op1, L, is_far);
      break;
    default:
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::enc_cmpEqNe_imm0_branch(int cmpFlag, Register op1, Label& L, bool is_far) {
  switch (cmpFlag) {
    case BoolTest::eq:
      beqz(op1, L, is_far);
      break;
    case BoolTest::ne:
      bnez(op1, L, is_far);
      break;
    default:
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::enc_cmove(int cmpFlag, Register op1, Register op2, Register dst, Register src) {
  Label L;
  cmp_branch(cmpFlag ^ (1 << neg_cond_bits), op1, op2, L);
  mv(dst, src);
  bind(L);
}

// Set dst to NaN if any NaN input.
void C2_MacroAssembler::minmax_fp(FloatRegister dst, FloatRegister src1, FloatRegister src2,
                                  bool is_double, bool is_min) {
  assert_different_registers(dst, src1, src2);

  Label Done, Compare;

  is_double ? fclass_d(t0, src1)
            : fclass_s(t0, src1);
  is_double ? fclass_d(t1, src2)
            : fclass_s(t1, src2);
  orr(t0, t0, t1);
  andi(t0, t0, 0b1100000000); // if src1 or src2 is a quiet or signaling NaN then return NaN
  beqz(t0, Compare);
  is_double ? fadd_d(dst, src1, src2)
            : fadd_s(dst, src1, src2);
  j(Done);

  bind(Compare);
  if (is_double) {
    is_min ? fmin_d(dst, src1, src2)
           : fmax_d(dst, src1, src2);
  } else {
    is_min ? fmin_s(dst, src1, src2)
           : fmax_s(dst, src1, src2);
  }

  bind(Done);
}

void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
                                        VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
  Label loop;
  Assembler::SEW sew = islatin ? Assembler::e8 : Assembler::e16;
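  // Per iteration: vsetvli picks vl elements from the remaining cnt, both
  // inputs are loaded and compared lane-wise, and vfirst_m returns the index
  // of the first mismatching lane (or -1 if none) so we can exit early.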

  bind(loop);
  vsetvli(tmp1, cnt, sew, Assembler::m2);
  vlex_v(vr1, a1, sew);
  vlex_v(vr2, a2, sew);
  vmsne_vv(vrs, vr1, vr2);
  vfirst_m(tmp2, vrs);
  bgez(tmp2, DONE);
  sub(cnt, cnt, tmp1);
  if (!islatin) {
    slli(tmp1, tmp1, 1); // get byte counts
  }
  add(a1, a1, tmp1);
  add(a2, a2, tmp1);
  bnez(cnt, loop);

  mv(result, true);
}

void C2_MacroAssembler::string_equals_v(Register a1, Register a2, Register result, Register cnt, int elem_size) {
  Label DONE;
  Register tmp1 = t0;
  Register tmp2 = t1;

  BLOCK_COMMENT("string_equals_v {");

  mv(result, false);

  if (elem_size == 2) {
    srli(cnt, cnt, 1);
  }

  element_compare(a1, a2, result, cnt, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);

  bind(DONE);
  BLOCK_COMMENT("} string_equals_v");
}

// Used by C2 ClearArray patterns.
// base: Address of a buffer to be zeroed
// cnt: Count in HeapWords
//
// base, cnt, v4, v5, v6, v7 and t0 are clobbered.
void C2_MacroAssembler::clear_array_v(Register base, Register cnt) {
  Label loop;

  // making zero words
  vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
  vxor_vv(v4, v4, v4);

  bind(loop);
  vsetvli(t0, cnt, Assembler::e64, Assembler::m4);
  vse64_v(v4, base);
  sub(cnt, cnt, t0);
  shadd(base, t0, base, t0, 3);
  bnez(cnt, loop);
}

void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register result,
                                        Register cnt1, int elem_size) {
  Label DONE;
  Register tmp1 = t0;
  Register tmp2 = t1;
  Register cnt2 = tmp2;
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);

  BLOCK_COMMENT("arrays_equals_v {");

  // if (a1 == a2), return true
  mv(result, true);
  beq(a1, a2, DONE);

  mv(result, false);
  // if a1 == null or a2 == null, return false
  beqz(a1, DONE);
  beqz(a2, DONE);
  // if (a1.length != a2.length), return false
  lwu(cnt1, Address(a1, length_offset));
  lwu(cnt2, Address(a2, length_offset));
  bne(cnt1, cnt2, DONE);

  la(a1, Address(a1, base_offset));
  la(a2, Address(a2, base_offset));

  element_compare(a1, a2, result, cnt1, tmp1, tmp2, v2, v4, v2, elem_size == 1, DONE);

  bind(DONE);

  BLOCK_COMMENT("} arrays_equals_v");
}

void C2_MacroAssembler::string_compare_v(Register str1, Register str2, Register cnt1, Register cnt2,
                                         Register result, Register tmp1, Register tmp2, int encForm) {
  Label DIFFERENCE, DONE, L, loop;
  bool encLL = encForm == StrIntrinsicNode::LL;
  bool encLU = encForm == StrIntrinsicNode::LU;
  bool encUL = encForm == StrIntrinsicNode::UL;

  bool str1_isL = encLL || encLU;
  bool str2_isL = encLL || encUL;

  int minCharsInWord = encLL ? wordSize : wordSize / 2;

  BLOCK_COMMENT("string_compare_v {");

  // for Latin strings, 1 byte for 1 character
  // for UTF16 strings, 2 bytes for 1 character
  if (!str1_isL)
    sraiw(cnt1, cnt1, 1);
  if (!str2_isL)
    sraiw(cnt2, cnt2, 1);

  // Compute the difference of the string lengths (result holds it if the
  // strings compare equal) and save the minimum of the lengths in cnt2.
  sub(result, cnt1, cnt2);
  bgt(cnt1, cnt2, L);
  mv(cnt2, cnt1);
  bind(L);

  if (str1_isL == str2_isL) { // LL or UU
    element_compare(str1, str2, zr, cnt2, tmp1, tmp2, v2, v4, v2, encLL, DIFFERENCE);
    j(DONE);
  } else { // LU or UL
    Register strL = encLU ? str1 : str2;
    Register strU = encLU ? str2 : str1;
    VectorRegister vstr1 = encLU ? v8 : v4;
    VectorRegister vstr2 = encLU ? v4 : v8;
1724
1725 bind(loop);
1726 vsetvli(tmp1, cnt2, Assembler::e8, Assembler::m2);
1727 vle8_v(vstr1, strL);
1728 vsetvli(tmp1, cnt2, Assembler::e16, Assembler::m4);
1729 vzext_vf2(vstr2, vstr1);
1730 vle16_v(vstr1, strU);
1731 vmsne_vv(v4, vstr2, vstr1);
1732 vfirst_m(tmp2, v4);
1733 bgez(tmp2, DIFFERENCE);
1734 sub(cnt2, cnt2, tmp1);
1735 add(strL, strL, tmp1);
1736 shadd(strU, tmp1, strU, tmp1, 1);
1737 bnez(cnt2, loop);
1738 j(DONE);
1739 }
1740
1741 bind(DIFFERENCE);
1742 slli(tmp1, tmp2, 1);
1743 add(str1, str1, str1_isL ? tmp2 : tmp1);
1744 add(str2, str2, str2_isL ? tmp2 : tmp1);
1745 str1_isL ? lbu(tmp1, Address(str1, 0)) : lhu(tmp1, Address(str1, 0));
1746 str2_isL ? lbu(tmp2, Address(str2, 0)) : lhu(tmp2, Address(str2, 0));
1747 sub(result, tmp1, tmp2);
1748
1749 bind(DONE);
1750 }
1751
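// Inflate a byte[] (Latin-1) into a char[] (UTF-16) by zero-extending each
// byte to 16 bits. src, dst, len and tmp are clobbered, as are t0 and v4..v7.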
void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Register len, Register tmp) {
  Label loop;
  assert_different_registers(src, dst, len, tmp, t0);

  BLOCK_COMMENT("byte_array_inflate_v {");
  bind(loop);
  vsetvli(tmp, len, Assembler::e8, Assembler::m2);
  vle8_v(v6, src);
  vsetvli(t0, len, Assembler::e16, Assembler::m4);
  vzext_vf2(v4, v6);
  vse16_v(v4, dst);
  sub(len, len, tmp);
  add(src, src, tmp);
  shadd(dst, tmp, dst, tmp, 1); // dst += vl * 2
  bnez(len, loop);
  BLOCK_COMMENT("} byte_array_inflate_v");
}

// Compress char[] array to byte[].
// result: the array length if every element in the array can be encoded,
// zero otherwise.
void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
                                              Register result, Register tmp) {
  Label done;
  encode_iso_array_v(src, dst, len, result, tmp, false);
  beqz(len, done); // encode_iso_array_v leaves len == 0 iff every char was copied
  mv(result, zr);
  bind(done);
}

// Intrinsic for
//
// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
//     return the number of characters copied.
// - java/lang/StringUTF16.compress
//     return zero (0) if copy fails, otherwise 'len'.
//
// This version always returns the number of characters copied. A successful
// copy will complete with the post-condition: 'result' == 'len', while an
// unsuccessful copy will exit with the post-condition: 0 <= 'result' < 'len'.
//
// Clobbers: src, dst, len, result, tmp, t0 and v0..v3
void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
                                           Register result, Register tmp, bool ascii) {
  Label loop, fail, done;

  BLOCK_COMMENT("encode_iso_array_v {");
  mv(result, 0);

  bind(loop);
  mv(tmp, ascii ? 0x7f : 0xff); // reload each iteration: vfirst_m below clobbers tmp
  vsetvli(t0, len, Assembler::e16, Assembler::m2);
  vle16_v(v2, src);

  vmsgtu_vx(v1, v2, tmp); // v1 = mask of chars above the encodable limit
  vfirst_m(tmp, v1);      // tmp = index of first unencodable char, or -1
  vmsbf_m(v0, v1);        // v0 = set-before-first: the chars that do fit
  // compress char to byte
  vsetvli(t0, len, Assembler::e8);
  vncvt_x_x_w(v1, v2, Assembler::v0_t);
  vse8_v(v1, dst, Assembler::v0_t);

  // fail if char > 0x7f/0xff
  bgez(tmp, fail);
  add(result, result, t0);
  add(dst, dst, t0);
  sub(len, len, t0);
  shadd(src, t0, src, t0, 1);
  bnez(len, loop);
  j(done);

  bind(fail);
  add(result, result, tmp); // count the chars copied before the failure

  bind(done);
  BLOCK_COMMENT("} encode_iso_array_v");
}

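// Count the leading non-negative bytes of 'ary', i.e. the scalar equivalent of:
//
//   int i = 0;
//   while (i < len && ary[i] >= 0) i++;
//   result = i;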
void C2_MacroAssembler::count_positives_v(Register ary, Register len, Register result, Register tmp) {
  Label LOOP, SET_RESULT, DONE;

  BLOCK_COMMENT("count_positives_v {");
  assert_different_registers(ary, len, result, tmp);

  mv(result, zr);

  bind(LOOP);
  vsetvli(t0, len, Assembler::e8, Assembler::m4);
  vle8_v(v4, ary);
  vmslt_vx(v4, v4, zr);
  vfirst_m(tmp, v4);
  bgez(tmp, SET_RESULT);
  // if tmp == -1, all bytes are positive
  add(result, result, t0);

  sub(len, len, t0);
  add(ary, ary, t0);
  bnez(len, LOOP);
  j(DONE);

  // add remaining positive bytes count
  bind(SET_RESULT);
  add(result, result, tmp);

  bind(DONE);
  BLOCK_COMMENT("} count_positives_v");
}

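// Find the first occurrence of 'ch' in str1[0..cnt1) and return its element
// index in 'result', or -1 if it does not occur. Scalar equivalent:
//
//   for (int i = 0; i < cnt1; i++) {
//     if (str1[i] == ch) return i;
//   }
//   return -1;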
void C2_MacroAssembler::string_indexof_char_v(Register str1, Register cnt1,
                                              Register ch, Register result,
                                              Register tmp1, Register tmp2,
                                              bool isL) {
  mv(result, zr);

  Label loop, MATCH, DONE;
  Assembler::SEW sew = isL ? Assembler::e8 : Assembler::e16;
  bind(loop);
  vsetvli(tmp1, cnt1, sew, Assembler::m4);
  vlex_v(v4, str1, sew);
  vmseq_vx(v4, v4, ch);
  vfirst_m(tmp2, v4);
  bgez(tmp2, MATCH); // if equal, return index

  add(result, result, tmp1);
  sub(cnt1, cnt1, tmp1);
  if (!isL) {
    slli(tmp1, tmp1, 1); // convert element count to byte count
  }
  add(str1, str1, tmp1);
  bnez(cnt1, loop);

  mv(result, -1);
  j(DONE);

  bind(MATCH);
  add(result, result, tmp2);

  bind(DONE);
}

// Set dst to NaN if any NaN input (element-wise).
void C2_MacroAssembler::minmax_fp_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
                                    bool is_double, bool is_min, int vector_length) {
  assert_different_registers(dst, src1, src2);

  vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);

  is_min ? vfmin_vv(dst, src1, src2)
         : vfmax_vv(dst, src1, src2);

  // For each lane where an input is NaN, overwrite dst with src + src,
  // which propagates the NaN.
  vmfne_vv(v0, src1, src1);
  vfadd_vv(dst, src1, src1, Assembler::v0_t);
  vmfne_vv(v0, src2, src2);
  vfadd_vv(dst, src2, src2, Assembler::v0_t);
}

// Set dst to NaN if any NaN input.
// The destination vector register elements corresponding to masked-off elements
// are handled with a mask-undisturbed policy.
void C2_MacroAssembler::minmax_fp_masked_v(VectorRegister dst, VectorRegister src1, VectorRegister src2,
                                           VectorRegister vmask, VectorRegister tmp1, VectorRegister tmp2,
                                           bool is_double, bool is_min, int vector_length) {
  assert_different_registers(src1, src2, tmp1, tmp2);
  vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);

  // Check vector elements of src1 and src2 for NaN.
  vmfeq_vv(tmp1, src1, src1);
  vmfeq_vv(tmp2, src2, src2);

  // Propagate NaN into active lanes whose input is NaN.
  vmandn_mm(v0, vmask, tmp1);
  vfadd_vv(dst, src1, src1, Assembler::v0_t);
  vmandn_mm(v0, vmask, tmp2);
  vfadd_vv(dst, src2, src2, Assembler::v0_t);

  // min/max only the active lanes where both inputs are ordered.
  vmand_mm(tmp2, tmp1, tmp2);
  vmand_mm(v0, vmask, tmp2);
  is_min ? vfmin_vv(dst, src1, src2, Assembler::v0_t)
         : vfmax_vv(dst, src1, src2, Assembler::v0_t);
}

// Set dst to NaN if any NaN input.
void C2_MacroAssembler::reduce_minmax_fp_v(FloatRegister dst,
                                           FloatRegister src1, VectorRegister src2,
                                           VectorRegister tmp1, VectorRegister tmp2,
                                           bool is_double, bool is_min, int vector_length, VectorMask vm) {
  assert_different_registers(dst, src1);
  assert_different_registers(src2, tmp1, tmp2);

  Label L_done, L_NaN_1, L_NaN_2;
  // Set dst to src1 if src1 is NaN
  is_double ? feq_d(t0, src1, src1)
            : feq_s(t0, src1, src1);
  beqz(t0, L_NaN_2);

  vsetvli_helper(is_double ? T_DOUBLE : T_FLOAT, vector_length);
  vfmv_s_f(tmp2, src1);

  is_min ? vfredmin_vs(tmp1, src2, tmp2, vm)
         : vfredmax_vs(tmp1, src2, tmp2, vm);
  vfmv_f_s(dst, tmp1);

  // Checking NaNs in src2
  vmfne_vv(tmp1, src2, src2, vm);
  vcpop_m(t0, tmp1, vm);
  beqz(t0, L_done);

  // src2 contains a NaN: an unordered sum over src2 propagates it into dst.
  bind(L_NaN_1);
  vfredusum_vs(tmp1, src2, tmp2, vm);
  vfmv_f_s(dst, tmp1);
  j(L_done);

  bind(L_NaN_2);
  is_double ? fmv_d(dst, src1)
            : fmv_s(dst, src1);
  bind(L_done);
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}

void C2_MacroAssembler::reduce_integral_v(Register dst, Register src1,
                                          VectorRegister src2, VectorRegister tmp,
                                          int opc, BasicType bt, int vector_length, VectorMask vm) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  vsetvli_helper(bt, vector_length);
  vmv_s_x(tmp, src1);
  switch (opc) {
    case Op_AddReductionVI:
    case Op_AddReductionVL:
      vredsum_vs(tmp, src2, tmp, vm);
      break;
    case Op_AndReductionV:
      vredand_vs(tmp, src2, tmp, vm);
      break;
    case Op_OrReductionV:
      vredor_vs(tmp, src2, tmp, vm);
      break;
    case Op_XorReductionV:
      vredxor_vs(tmp, src2, tmp, vm);
      break;
    case Op_MaxReductionV:
      vredmax_vs(tmp, src2, tmp, vm);
      break;
    case Op_MinReductionV:
      vredmin_vs(tmp, src2, tmp, vm);
      break;
    default:
      ShouldNotReachHere();
  }
  vmv_x_s(dst, tmp);
}

// Set vl and vtype for full and partial vector operations.
// (vma = mu, vta = tu, vill = false)
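// vsetivli can only encode the application vector length as a 5-bit
// immediate (max 31). Requesting exactly VLMAX elements is expressed with
// AVL = x0 and a non-x0 rd, which sets vl to VLMAX for the given SEW/LMUL.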
void C2_MacroAssembler::vsetvli_helper(BasicType bt, int vector_length, LMUL vlmul, Register tmp) {
  Assembler::SEW sew = Assembler::elemtype_to_sew(bt);
  if (vector_length <= 31) {
    vsetivli(tmp, vector_length, sew, vlmul);
  } else if (vector_length == (MaxVectorSize / type2aelembytes(bt))) {
    vsetvli(tmp, x0, sew, vlmul);
  } else {
    mv(tmp, vector_length);
    vsetvli(tmp, tmp, sew, vlmul);
  }
}

void C2_MacroAssembler::compare_integral_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
                                           int cond, BasicType bt, int vector_length, VectorMask vm) {
  assert(is_integral_type(bt), "unsupported element type");
  assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
  vsetvli_helper(bt, vector_length);
  vmclr_m(vd);
  switch (cond) {
    case BoolTest::eq: vmseq_vv(vd, src1, src2, vm); break;
    case BoolTest::ne: vmsne_vv(vd, src1, src2, vm); break;
    case BoolTest::le: vmsle_vv(vd, src1, src2, vm); break;
    case BoolTest::ge: vmsge_vv(vd, src1, src2, vm); break;
    case BoolTest::lt: vmslt_vv(vd, src1, src2, vm); break;
    case BoolTest::gt: vmsgt_vv(vd, src1, src2, vm); break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::compare_fp_v(VectorRegister vd, VectorRegister src1, VectorRegister src2,
                                     int cond, BasicType bt, int vector_length, VectorMask vm) {
  assert(is_floating_point_type(bt), "unsupported element type");
  assert(vm == Assembler::v0_t ? vd != v0 : true, "should be different registers");
  vsetvli_helper(bt, vector_length);
  vmclr_m(vd);
  switch (cond) {
    case BoolTest::eq: vmfeq_vv(vd, src1, src2, vm); break;
    case BoolTest::ne: vmfne_vv(vd, src1, src2, vm); break;
    case BoolTest::le: vmfle_vv(vd, src1, src2, vm); break;
    case BoolTest::ge: vmfge_vv(vd, src1, src2, vm); break;
    case BoolTest::lt: vmflt_vv(vd, src1, src2, vm); break;
    case BoolTest::gt: vmfgt_vv(vd, src1, src2, vm); break;
    default:
      assert(false, "unsupported compare condition");
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::integer_extend_v(VectorRegister dst, BasicType dst_bt, int vector_length,
                                         VectorRegister src, BasicType src_bt) {
  assert(type2aelembytes(dst_bt) > type2aelembytes(src_bt) &&
         type2aelembytes(dst_bt) <= 8 && type2aelembytes(src_bt) <= 4, "invalid element size");
  assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
  // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#52-vector-operands
  // Overlap is only permitted when the destination EEW is greater than the
  // source EEW, the source EMUL is at least 1, and the overlap is in the
  // highest-numbered part of the destination register group.
  // Since LMUL == 1 here, vd and vs cannot overlap at all.
  assert_different_registers(dst, src);

  vsetvli_helper(dst_bt, vector_length);
  if (src_bt == T_BYTE) {
    switch (dst_bt) {
      case T_SHORT:
        vsext_vf2(dst, src);
        break;
      case T_INT:
        vsext_vf4(dst, src);
        break;
      case T_LONG:
        vsext_vf8(dst, src);
        break;
      default:
        ShouldNotReachHere();
    }
  } else if (src_bt == T_SHORT) {
    if (dst_bt == T_INT) {
      vsext_vf2(dst, src);
    } else {
      vsext_vf4(dst, src);
    }
  } else if (src_bt == T_INT) {
    vsext_vf2(dst, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// High part of dst vector will be filled with zero.
void C2_MacroAssembler::integer_narrow_v(VectorRegister dst, BasicType dst_bt, int vector_length,
                                         VectorRegister src, BasicType src_bt) {
  assert(type2aelembytes(dst_bt) < type2aelembytes(src_bt) &&
         type2aelembytes(dst_bt) <= 4 && type2aelembytes(src_bt) <= 8, "invalid element size");
  assert(dst_bt != T_FLOAT && dst_bt != T_DOUBLE && src_bt != T_FLOAT && src_bt != T_DOUBLE, "unsupported element type");
  mv(t0, vector_length);
  if (src_bt == T_LONG) {
    // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#117-vector-narrowing-integer-right-shift-instructions
    // Future extensions might add support for versions that narrow to a destination that is 1/4 the width of the source.
    // So we can currently only scale down by 1/2 the width at a time.
    vsetvli(t0, t0, Assembler::e32, Assembler::mf2);
    vncvt_x_x_w(dst, src);
    if (dst_bt == T_SHORT || dst_bt == T_BYTE) {
      vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
      vncvt_x_x_w(dst, dst);
      if (dst_bt == T_BYTE) {
        vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
        vncvt_x_x_w(dst, dst);
      }
    }
  } else if (src_bt == T_INT) {
    // narrow to T_SHORT first, then to T_BYTE below if required
    vsetvli(t0, t0, Assembler::e16, Assembler::mf2);
    vncvt_x_x_w(dst, src);
    if (dst_bt == T_BYTE) {
      vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
      vncvt_x_x_w(dst, dst);
    }
  } else if (src_bt == T_SHORT) {
    vsetvli(t0, t0, Assembler::e8, Assembler::mf2);
    vncvt_x_x_w(dst, src);
  }
}

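// Emit a NaN-safe vector float-to-integer conversion: dst is zeroed first
// and the conversion is applied only to lanes whose source is not NaN
// (vmfeq_vv sets v0 exactly for the ordered lanes), so NaN inputs become 0
// as Java's (int)/(long) casts require.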
#define VFCVT_SAFE(VFLOATCVT)                                                      \
void C2_MacroAssembler::VFLOATCVT##_safe(VectorRegister dst, VectorRegister src) { \
  assert_different_registers(dst, src);                                            \
  vxor_vv(dst, dst, dst);                                                          \
  vmfeq_vv(v0, src, src);                                                          \
  VFLOATCVT(dst, src, Assembler::v0_t);                                            \
}

VFCVT_SAFE(vfcvt_rtz_x_f_v);

#undef VFCVT_SAFE

// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::extract_v(Register dst, VectorRegister src, BasicType bt,
                                  int idx, VectorRegister tmp) {
  assert(is_integral_type(bt), "unsupported element type");
  assert(idx >= 0, "idx cannot be negative");
  // Only need the first element after vector slidedown
  vsetvli_helper(bt, 1);
  if (idx == 0) {
    vmv_x_s(dst, src);
  } else if (idx <= 31) {
    vslidedown_vi(tmp, src, idx);
    vmv_x_s(dst, tmp);
  } else {
    mv(t0, idx);
    vslidedown_vx(tmp, src, t0);
    vmv_x_s(dst, tmp);
  }
}

// Extract a scalar element from a vector at position 'idx'.
// The input elements in src are expected to be of floating point type.
void C2_MacroAssembler::extract_fp_v(FloatRegister dst, VectorRegister src, BasicType bt,
                                     int idx, VectorRegister tmp) {
  assert(is_floating_point_type(bt), "unsupported element type");
  assert(idx >= 0, "idx cannot be negative");
  // Only need the first element after vector slidedown
  vsetvli_helper(bt, 1);
  if (idx == 0) {
    vfmv_f_s(dst, src);
  } else if (idx <= 31) {
    vslidedown_vi(tmp, src, idx);
    vfmv_f_s(dst, tmp);
  } else {
    mv(t0, idx);
    vslidedown_vx(tmp, src, t0);
    vfmv_f_s(dst, tmp);
  }
}