1 /*
  2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #include "precompiled.hpp"
 26 #include "asm/assembler.hpp"
 27 #include "asm/assembler.inline.hpp"
 28 #include "opto/c2_MacroAssembler.hpp"
 29 #include "opto/intrinsicnode.hpp"
 30 #include "opto/subnode.hpp"
 31 #include "runtime/stubRoutines.hpp"
 32 
 33 #ifdef PRODUCT
 34 #define BLOCK_COMMENT(str) /* nothing */
 35 #define STOP(error) stop(error)
 36 #else
 37 #define BLOCK_COMMENT(str) block_comment(str)
 38 #define STOP(error) block_comment(error); stop(error)
 39 #endif
 40 
 41 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
 42 
 43 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
 44 
 45 // Search for str1 in str2 and return index or -1
 46 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 47 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 48                                        Register cnt2, Register cnt1,
 49                                        Register tmp1, Register tmp2,
 50                                        Register tmp3, Register tmp4,
 51                                        Register tmp5, Register tmp6,
 52                                        int icnt1, Register result, int ae) {
 53   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 54   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 55 
 56   Register ch1 = rscratch1;
 57   Register ch2 = rscratch2;
 58   Register cnt1tmp = tmp1;
 59   Register cnt2tmp = tmp2;
 60   Register cnt1_neg = cnt1;
 61   Register cnt2_neg = cnt2;
 62   Register result_tmp = tmp4;
 63 
 64   bool isL = ae == StrIntrinsicNode::LL;
 65 
 66   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 67   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 68   int str1_chr_shift = str1_isL ? 0:1;
 69   int str2_chr_shift = str2_isL ? 0:1;
 70   int str1_chr_size = str1_isL ? 1:2;
 71   int str2_chr_size = str2_isL ? 1:2;
 72   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 73                                       (chr_insn)&MacroAssembler::ldrh;
 74   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 75                                       (chr_insn)&MacroAssembler::ldrh;
 76   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 77   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 78 
 79   // Note, inline_string_indexOf() generates checks:
 80   // if (substr.count > string.count) return -1;
 81   // if (substr.count == 0) return 0;
 82 
 83   // We have two strings, a source string in str2, cnt2 and a pattern string
 84   // in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
 85 
 86   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 87   // With a small pattern and source we use linear scan.
 88 
 89   if (icnt1 == -1) {
 90     sub(result_tmp, cnt2, cnt1);
 91     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 92     br(LT, LINEARSEARCH);
 93     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 94     subs(zr, cnt1, 256);
 95     lsr(tmp1, cnt2, 2);
 96     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 97     br(GE, LINEARSTUB);
 98   }
 99 
100 // The Boyer Moore alogorithm is based on the description here:-
101 //
102 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
103 //
104 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
105 // and the 'Good Suffix' rule.
106 //
107 // These rules are essentially heuristics for how far we can shift the
108 // pattern along the search string.
109 //
110 // The implementation here uses the 'Bad Character' rule only because of the
111 // complexity of initialisation for the 'Good Suffix' rule.
112 //
113 // This is also known as the Boyer-Moore-Horspool algorithm:-
114 //
115 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
116 //
117 // This particular implementation has few java-specific optimizations.
118 //
119 // #define ASIZE 256
120 //
121 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
122 //       int i, j;
123 //       unsigned c;
124 //       unsigned char bc[ASIZE];
125 //
126 //       /* Preprocessing */
127 //       for (i = 0; i < ASIZE; ++i)
128 //          bc[i] = m;
129 //       for (i = 0; i < m - 1; ) {
130 //          c = x[i];
131 //          ++i;
132 //          // c < 256 for Latin1 string, so, no need for branch
133 //          #ifdef PATTERN_STRING_IS_LATIN1
134 //          bc[c] = m - i;
135 //          #else
136 //          if (c < ASIZE) bc[c] = m - i;
137 //          #endif
138 //       }
139 //
140 //       /* Searching */
141 //       j = 0;
142 //       while (j <= n - m) {
143 //          c = y[i+j];
144 //          if (x[m-1] == c)
145 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
146 //          if (i < 0) return j;
147 //          // c < 256 for Latin1 string, so, no need for branch
148 //          #ifdef SOURCE_STRING_IS_LATIN1
149 //          // LL case: (c< 256) always true. Remove branch
150 //          j += bc[y[j+m-1]];
151 //          #endif
152 //          #ifndef PATTERN_STRING_IS_UTF
153 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
154 //          if (c < ASIZE)
155 //            j += bc[y[j+m-1]];
156 //          else
157 //            j += 1
158 //          #endif
159 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
160 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
161 //          if (c < ASIZE)
162 //            j += bc[y[j+m-1]];
163 //          else
164 //            j += m
165 //          #endif
166 //       }
167 //    }
168 
169   if (icnt1 == -1) {
170     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
171         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
172     Register cnt1end = tmp2;
173     Register str2end = cnt2;
174     Register skipch = tmp2;
175 
176     // str1 length is >=8, so, we can read at least 1 register for cases when
177     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
178     // UL case. We'll re-read last character in inner pre-loop code to have
179     // single outer pre-loop load
180     const int firstStep = isL ? 7 : 3;
181 
182     const int ASIZE = 256;
183     const int STORED_BYTES = 32; // amount of bytes stored per instruction
184     sub(sp, sp, ASIZE);
185     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
186     mov(ch1, sp);
187     BIND(BM_INIT_LOOP);
188       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
189       subs(tmp5, tmp5, 1);
190       br(GT, BM_INIT_LOOP);
191 
192       sub(cnt1tmp, cnt1, 1);
193       mov(tmp5, str2);
194       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
195       sub(ch2, cnt1, 1);
196       mov(tmp3, str1);
197     BIND(BCLOOP);
198       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
199       if (!str1_isL) {
200         subs(zr, ch1, ASIZE);
201         br(HS, BCSKIP);
202       }
203       strb(ch2, Address(sp, ch1));
204     BIND(BCSKIP);
205       subs(ch2, ch2, 1);
206       br(GT, BCLOOP);
207 
208       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
209       if (str1_isL == str2_isL) {
210         // load last 8 bytes (8LL/4UU symbols)
211         ldr(tmp6, Address(tmp6, -wordSize));
212       } else {
213         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
214         // convert Latin1 to UTF. We'll have to wait until load completed, but
215         // it's still faster than per-character loads+checks
216         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
217         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
218         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
219         andr(tmp6, tmp6, 0xFF); // str1[N-4]
220         orr(ch2, ch1, ch2, LSL, 16);
221         orr(tmp6, tmp6, tmp3, LSL, 48);
222         orr(tmp6, tmp6, ch2, LSL, 16);
223       }
224     BIND(BMLOOPSTR2);
225       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
226       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
227       if (str1_isL == str2_isL) {
228         // re-init tmp3. It's for free because it's executed in parallel with
229         // load above. Alternative is to initialize it before loop, but it'll
230         // affect performance on in-order systems with 2 or more ld/st pipelines
231         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
232       }
233       if (!isL) { // UU/UL case
234         lsl(ch2, cnt1tmp, 1); // offset in bytes
235       }
236       cmp(tmp3, skipch);
237       br(NE, BMSKIP);
238       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
239       mov(ch1, tmp6);
240       if (isL) {
241         b(BMLOOPSTR1_AFTER_LOAD);
242       } else {
243         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
244         b(BMLOOPSTR1_CMP);
245       }
246     BIND(BMLOOPSTR1);
247       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
248       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
249     BIND(BMLOOPSTR1_AFTER_LOAD);
250       subs(cnt1tmp, cnt1tmp, 1);
251       br(LT, BMLOOPSTR1_LASTCMP);
252     BIND(BMLOOPSTR1_CMP);
253       cmp(ch1, ch2);
254       br(EQ, BMLOOPSTR1);
255     BIND(BMSKIP);
256       if (!isL) {
257         // if we've met UTF symbol while searching Latin1 pattern, then we can
258         // skip cnt1 symbols
259         if (str1_isL != str2_isL) {
260           mov(result_tmp, cnt1);
261         } else {
262           mov(result_tmp, 1);
263         }
264         subs(zr, skipch, ASIZE);
265         br(HS, BMADV);
266       }
267       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
268     BIND(BMADV);
269       sub(cnt1tmp, cnt1, 1);
270       add(str2, str2, result_tmp, LSL, str2_chr_shift);
271       cmp(str2, str2end);
272       br(LE, BMLOOPSTR2);
273       add(sp, sp, ASIZE);
274       b(NOMATCH);
275     BIND(BMLOOPSTR1_LASTCMP);
276       cmp(ch1, ch2);
277       br(NE, BMSKIP);
278     BIND(BMMATCH);
279       sub(result, str2, tmp5);
280       if (!str2_isL) lsr(result, result, 1);
281       add(sp, sp, ASIZE);
282       b(DONE);
283 
284     BIND(LINEARSTUB);
285     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
286     br(LT, LINEAR_MEDIUM);
287     mov(result, zr);
288     RuntimeAddress stub = NULL;
289     if (isL) {
290       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
291       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
292     } else if (str1_isL) {
293       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
294        assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
295     } else {
296       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
297       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
298     }
299     trampoline_call(stub);
300     b(DONE);
301   }
302 
303   BIND(LINEARSEARCH);
304   {
305     Label DO1, DO2, DO3;
306 
307     Register str2tmp = tmp2;
308     Register first = tmp3;
309 
310     if (icnt1 == -1)
311     {
312         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
313 
314         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
315         br(LT, DOSHORT);
316       BIND(LINEAR_MEDIUM);
317         (this->*str1_load_1chr)(first, Address(str1));
318         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
319         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
320         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
321         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
322 
323       BIND(FIRST_LOOP);
324         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
325         cmp(first, ch2);
326         br(EQ, STR1_LOOP);
327       BIND(STR2_NEXT);
328         adds(cnt2_neg, cnt2_neg, str2_chr_size);
329         br(LE, FIRST_LOOP);
330         b(NOMATCH);
331 
332       BIND(STR1_LOOP);
333         adds(cnt1tmp, cnt1_neg, str1_chr_size);
334         add(cnt2tmp, cnt2_neg, str2_chr_size);
335         br(GE, MATCH);
336 
337       BIND(STR1_NEXT);
338         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
339         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
340         cmp(ch1, ch2);
341         br(NE, STR2_NEXT);
342         adds(cnt1tmp, cnt1tmp, str1_chr_size);
343         add(cnt2tmp, cnt2tmp, str2_chr_size);
344         br(LT, STR1_NEXT);
345         b(MATCH);
346 
347       BIND(DOSHORT);
348       if (str1_isL == str2_isL) {
349         cmp(cnt1, (u1)2);
350         br(LT, DO1);
351         br(GT, DO3);
352       }
353     }
354 
355     if (icnt1 == 4) {
356       Label CH1_LOOP;
357 
358         (this->*load_4chr)(ch1, str1);
359         sub(result_tmp, cnt2, 4);
360         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
361         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
362 
363       BIND(CH1_LOOP);
364         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
365         cmp(ch1, ch2);
366         br(EQ, MATCH);
367         adds(cnt2_neg, cnt2_neg, str2_chr_size);
368         br(LE, CH1_LOOP);
369         b(NOMATCH);
370       }
371 
372     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
373       Label CH1_LOOP;
374 
375       BIND(DO2);
376         (this->*load_2chr)(ch1, str1);
377         if (icnt1 == 2) {
378           sub(result_tmp, cnt2, 2);
379         }
380         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
381         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
382       BIND(CH1_LOOP);
383         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
384         cmp(ch1, ch2);
385         br(EQ, MATCH);
386         adds(cnt2_neg, cnt2_neg, str2_chr_size);
387         br(LE, CH1_LOOP);
388         b(NOMATCH);
389     }
390 
391     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
392       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
393 
394       BIND(DO3);
395         (this->*load_2chr)(first, str1);
396         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
397         if (icnt1 == 3) {
398           sub(result_tmp, cnt2, 3);
399         }
400         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
401         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
402       BIND(FIRST_LOOP);
403         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
404         cmpw(first, ch2);
405         br(EQ, STR1_LOOP);
406       BIND(STR2_NEXT);
407         adds(cnt2_neg, cnt2_neg, str2_chr_size);
408         br(LE, FIRST_LOOP);
409         b(NOMATCH);
410 
411       BIND(STR1_LOOP);
412         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
413         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
414         cmp(ch1, ch2);
415         br(NE, STR2_NEXT);
416         b(MATCH);
417     }
418 
419     if (icnt1 == -1 || icnt1 == 1) {
420       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
421 
422       BIND(DO1);
423         (this->*str1_load_1chr)(ch1, str1);
424         cmp(cnt2, (u1)8);
425         br(LT, DO1_SHORT);
426 
427         sub(result_tmp, cnt2, 8/str2_chr_size);
428         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
429         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
430         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
431 
432         if (str2_isL) {
433           orr(ch1, ch1, ch1, LSL, 8);
434         }
435         orr(ch1, ch1, ch1, LSL, 16);
436         orr(ch1, ch1, ch1, LSL, 32);
437       BIND(CH1_LOOP);
438         ldr(ch2, Address(str2, cnt2_neg));
439         eor(ch2, ch1, ch2);
440         sub(tmp1, ch2, tmp3);
441         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
442         bics(tmp1, tmp1, tmp2);
443         br(NE, HAS_ZERO);
444         adds(cnt2_neg, cnt2_neg, 8);
445         br(LT, CH1_LOOP);
446 
447         cmp(cnt2_neg, (u1)8);
448         mov(cnt2_neg, 0);
449         br(LT, CH1_LOOP);
450         b(NOMATCH);
451 
452       BIND(HAS_ZERO);
453         rev(tmp1, tmp1);
454         clz(tmp1, tmp1);
455         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
456         b(MATCH);
457 
458       BIND(DO1_SHORT);
459         mov(result_tmp, cnt2);
460         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
461         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
462       BIND(DO1_LOOP);
463         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
464         cmpw(ch1, ch2);
465         br(EQ, MATCH);
466         adds(cnt2_neg, cnt2_neg, str2_chr_size);
467         br(LT, DO1_LOOP);
468     }
469   }
470   BIND(NOMATCH);
471     mov(result, -1);
472     b(DONE);
473   BIND(MATCH);
474     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
475   BIND(DONE);
476 }
477 
478 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
479 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
480 
481 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
482                                             Register ch, Register result,
483                                             Register tmp1, Register tmp2, Register tmp3)
484 {
485   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
486   Register cnt1_neg = cnt1;
487   Register ch1 = rscratch1;
488   Register result_tmp = rscratch2;
489 
490   cbz(cnt1, NOMATCH);
491 
492   cmp(cnt1, (u1)4);
493   br(LT, DO1_SHORT);
494 
495   orr(ch, ch, ch, LSL, 16);
496   orr(ch, ch, ch, LSL, 32);
497 
498   sub(cnt1, cnt1, 4);
499   mov(result_tmp, cnt1);
500   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
501   sub(cnt1_neg, zr, cnt1, LSL, 1);
502 
503   mov(tmp3, 0x0001000100010001);
504 
505   BIND(CH1_LOOP);
506     ldr(ch1, Address(str1, cnt1_neg));
507     eor(ch1, ch, ch1);
508     sub(tmp1, ch1, tmp3);
509     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
510     bics(tmp1, tmp1, tmp2);
511     br(NE, HAS_ZERO);
512     adds(cnt1_neg, cnt1_neg, 8);
513     br(LT, CH1_LOOP);
514 
515     cmp(cnt1_neg, (u1)8);
516     mov(cnt1_neg, 0);
517     br(LT, CH1_LOOP);
518     b(NOMATCH);
519 
520   BIND(HAS_ZERO);
521     rev(tmp1, tmp1);
522     clz(tmp1, tmp1);
523     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
524     b(MATCH);
525 
526   BIND(DO1_SHORT);
527     mov(result_tmp, cnt1);
528     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
529     sub(cnt1_neg, zr, cnt1, LSL, 1);
530   BIND(DO1_LOOP);
531     ldrh(ch1, Address(str1, cnt1_neg));
532     cmpw(ch, ch1);
533     br(EQ, MATCH);
534     adds(cnt1_neg, cnt1_neg, 2);
535     br(LT, DO1_LOOP);
536   BIND(NOMATCH);
537     mov(result, -1);
538     b(DONE);
539   BIND(MATCH);
540     add(result, result_tmp, cnt1_neg, ASR, 1);
541   BIND(DONE);
542 }
543 
544 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
545                                             Register ch, Register result,
546                                             Register tmp1, Register tmp2, Register tmp3)
547 {
548   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
549   Register cnt1_neg = cnt1;
550   Register ch1 = rscratch1;
551   Register result_tmp = rscratch2;
552 
553   cbz(cnt1, NOMATCH);
554 
555   cmp(cnt1, (u1)8);
556   br(LT, DO1_SHORT);
557 
558   orr(ch, ch, ch, LSL, 8);
559   orr(ch, ch, ch, LSL, 16);
560   orr(ch, ch, ch, LSL, 32);
561 
562   sub(cnt1, cnt1, 8);
563   mov(result_tmp, cnt1);
564   lea(str1, Address(str1, cnt1));
565   sub(cnt1_neg, zr, cnt1);
566 
567   mov(tmp3, 0x0101010101010101);
568 
569   BIND(CH1_LOOP);
570     ldr(ch1, Address(str1, cnt1_neg));
571     eor(ch1, ch, ch1);
572     sub(tmp1, ch1, tmp3);
573     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
574     bics(tmp1, tmp1, tmp2);
575     br(NE, HAS_ZERO);
576     adds(cnt1_neg, cnt1_neg, 8);
577     br(LT, CH1_LOOP);
578 
579     cmp(cnt1_neg, (u1)8);
580     mov(cnt1_neg, 0);
581     br(LT, CH1_LOOP);
582     b(NOMATCH);
583 
584   BIND(HAS_ZERO);
585     rev(tmp1, tmp1);
586     clz(tmp1, tmp1);
587     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
588     b(MATCH);
589 
590   BIND(DO1_SHORT);
591     mov(result_tmp, cnt1);
592     lea(str1, Address(str1, cnt1));
593     sub(cnt1_neg, zr, cnt1);
594   BIND(DO1_LOOP);
595     ldrb(ch1, Address(str1, cnt1_neg));
596     cmp(ch, ch1);
597     br(EQ, MATCH);
598     adds(cnt1_neg, cnt1_neg, 1);
599     br(LT, DO1_LOOP);
600   BIND(NOMATCH);
601     mov(result, -1);
602     b(DONE);
603   BIND(MATCH);
604     add(result, result_tmp, cnt1_neg);
605   BIND(DONE);
606 }
607 
608 // Compare strings.
609 void C2_MacroAssembler::string_compare(Register str1, Register str2,
610     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
611     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
612   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
613       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
614       SHORT_LOOP_START, TAIL_CHECK;
615 
616   bool isLL = ae == StrIntrinsicNode::LL;
617   bool isLU = ae == StrIntrinsicNode::LU;
618   bool isUL = ae == StrIntrinsicNode::UL;
619 
620   // The stub threshold for LL strings is: 72 (64 + 8) chars
621   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
622   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
623   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
624 
625   bool str1_isL = isLL || isLU;
626   bool str2_isL = isLL || isUL;
627 
628   int str1_chr_shift = str1_isL ? 0 : 1;
629   int str2_chr_shift = str2_isL ? 0 : 1;
630   int str1_chr_size = str1_isL ? 1 : 2;
631   int str2_chr_size = str2_isL ? 1 : 2;
632   int minCharsInWord = isLL ? wordSize : wordSize/2;
633 
634   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
635   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
636                                       (chr_insn)&MacroAssembler::ldrh;
637   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
638                                       (chr_insn)&MacroAssembler::ldrh;
639   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
640                             (uxt_insn)&MacroAssembler::uxthw;
641 
642   BLOCK_COMMENT("string_compare {");
643 
644   // Bizzarely, the counts are passed in bytes, regardless of whether they
645   // are L or U strings, however the result is always in characters.
646   if (!str1_isL) asrw(cnt1, cnt1, 1);
647   if (!str2_isL) asrw(cnt2, cnt2, 1);
648 
649   // Compute the minimum of the string lengths and save the difference.
650   subsw(result, cnt1, cnt2);
651   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
652 
653   // A very short string
654   cmpw(cnt2, minCharsInWord);
655   br(Assembler::LE, SHORT_STRING);
656 
657   // Compare longwords
658   // load first parts of strings and finish initialization while loading
659   {
660     if (str1_isL == str2_isL) { // LL or UU
661       ldr(tmp1, Address(str1));
662       cmp(str1, str2);
663       br(Assembler::EQ, DONE);
664       ldr(tmp2, Address(str2));
665       cmp(cnt2, stub_threshold);
666       br(GE, STUB);
667       subsw(cnt2, cnt2, minCharsInWord);
668       br(EQ, TAIL_CHECK);
669       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
670       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
671       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
672     } else if (isLU) {
673       ldrs(vtmp, Address(str1));
674       ldr(tmp2, Address(str2));
675       cmp(cnt2, stub_threshold);
676       br(GE, STUB);
677       subw(cnt2, cnt2, 4);
678       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
679       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
680       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
681       zip1(vtmp, T8B, vtmp, vtmpZ);
682       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
683       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
684       add(cnt1, cnt1, 4);
685       fmovd(tmp1, vtmp);
686     } else { // UL case
687       ldr(tmp1, Address(str1));
688       ldrs(vtmp, Address(str2));
689       cmp(cnt2, stub_threshold);
690       br(GE, STUB);
691       subw(cnt2, cnt2, 4);
692       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
693       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
694       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
695       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
696       zip1(vtmp, T8B, vtmp, vtmpZ);
697       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
698       add(cnt1, cnt1, 8);
699       fmovd(tmp2, vtmp);
700     }
701     adds(cnt2, cnt2, isUL ? 4 : 8);
702     br(GE, TAIL);
703     eor(rscratch2, tmp1, tmp2);
704     cbnz(rscratch2, DIFF);
705     // main loop
706     bind(NEXT_WORD);
707     if (str1_isL == str2_isL) {
708       ldr(tmp1, Address(str1, cnt2));
709       ldr(tmp2, Address(str2, cnt2));
710       adds(cnt2, cnt2, 8);
711     } else if (isLU) {
712       ldrs(vtmp, Address(str1, cnt1));
713       ldr(tmp2, Address(str2, cnt2));
714       add(cnt1, cnt1, 4);
715       zip1(vtmp, T8B, vtmp, vtmpZ);
716       fmovd(tmp1, vtmp);
717       adds(cnt2, cnt2, 8);
718     } else { // UL
719       ldrs(vtmp, Address(str2, cnt2));
720       ldr(tmp1, Address(str1, cnt1));
721       zip1(vtmp, T8B, vtmp, vtmpZ);
722       add(cnt1, cnt1, 8);
723       fmovd(tmp2, vtmp);
724       adds(cnt2, cnt2, 4);
725     }
726     br(GE, TAIL);
727 
728     eor(rscratch2, tmp1, tmp2);
729     cbz(rscratch2, NEXT_WORD);
730     b(DIFF);
731     bind(TAIL);
732     eor(rscratch2, tmp1, tmp2);
733     cbnz(rscratch2, DIFF);
734     // Last longword.  In the case where length == 4 we compare the
735     // same longword twice, but that's still faster than another
736     // conditional branch.
737     if (str1_isL == str2_isL) {
738       ldr(tmp1, Address(str1));
739       ldr(tmp2, Address(str2));
740     } else if (isLU) {
741       ldrs(vtmp, Address(str1));
742       ldr(tmp2, Address(str2));
743       zip1(vtmp, T8B, vtmp, vtmpZ);
744       fmovd(tmp1, vtmp);
745     } else { // UL
746       ldrs(vtmp, Address(str2));
747       ldr(tmp1, Address(str1));
748       zip1(vtmp, T8B, vtmp, vtmpZ);
749       fmovd(tmp2, vtmp);
750     }
751     bind(TAIL_CHECK);
752     eor(rscratch2, tmp1, tmp2);
753     cbz(rscratch2, DONE);
754 
755     // Find the first different characters in the longwords and
756     // compute their difference.
757     bind(DIFF);
758     rev(rscratch2, rscratch2);
759     clz(rscratch2, rscratch2);
760     andr(rscratch2, rscratch2, isLL ? -8 : -16);
761     lsrv(tmp1, tmp1, rscratch2);
762     (this->*ext_chr)(tmp1, tmp1);
763     lsrv(tmp2, tmp2, rscratch2);
764     (this->*ext_chr)(tmp2, tmp2);
765     subw(result, tmp1, tmp2);
766     b(DONE);
767   }
768 
769   bind(STUB);
770     RuntimeAddress stub = NULL;
771     switch(ae) {
772       case StrIntrinsicNode::LL:
773         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
774         break;
775       case StrIntrinsicNode::UU:
776         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
777         break;
778       case StrIntrinsicNode::LU:
779         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
780         break;
781       case StrIntrinsicNode::UL:
782         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
783         break;
784       default:
785         ShouldNotReachHere();
786      }
787     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
788     trampoline_call(stub);
789     b(DONE);
790 
791   bind(SHORT_STRING);
792   // Is the minimum length zero?
793   cbz(cnt2, DONE);
794   // arrange code to do most branches while loading and loading next characters
795   // while comparing previous
796   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
797   subs(cnt2, cnt2, 1);
798   br(EQ, SHORT_LAST_INIT);
799   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
800   b(SHORT_LOOP_START);
801   bind(SHORT_LOOP);
802   subs(cnt2, cnt2, 1);
803   br(EQ, SHORT_LAST);
804   bind(SHORT_LOOP_START);
805   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
806   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
807   cmp(tmp1, cnt1);
808   br(NE, SHORT_LOOP_TAIL);
809   subs(cnt2, cnt2, 1);
810   br(EQ, SHORT_LAST2);
811   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
812   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
813   cmp(tmp2, rscratch1);
814   br(EQ, SHORT_LOOP);
815   sub(result, tmp2, rscratch1);
816   b(DONE);
817   bind(SHORT_LOOP_TAIL);
818   sub(result, tmp1, cnt1);
819   b(DONE);
820   bind(SHORT_LAST2);
821   cmp(tmp2, rscratch1);
822   br(EQ, DONE);
823   sub(result, tmp2, rscratch1);
824 
825   b(DONE);
826   bind(SHORT_LAST_INIT);
827   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
828   bind(SHORT_LAST);
829   cmp(tmp1, cnt1);
830   br(EQ, DONE);
831   sub(result, tmp1, cnt1);
832 
833   bind(DONE);
834 
835   BLOCK_COMMENT("} string_compare");
836 }
837 
838 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
839                                      FloatRegister src2, int cond, bool isQ) {
840   SIMD_Arrangement size = esize2arrangement(type2aelembytes(bt), isQ);
841   if (bt == T_FLOAT || bt == T_DOUBLE) {
842     switch (cond) {
843       case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
844       case BoolTest::ne: {
845         fcmeq(dst, size, src1, src2);
846         notr(dst, T16B, dst);
847         break;
848       }
849       case BoolTest::ge: fcmge(dst, size, src1, src2); break;
850       case BoolTest::gt: fcmgt(dst, size, src1, src2); break;
851       case BoolTest::le: fcmge(dst, size, src2, src1); break;
852       case BoolTest::lt: fcmgt(dst, size, src2, src1); break;
853       default:
854         assert(false, "unsupported");
855         ShouldNotReachHere();
856     }
857   } else {
858     switch (cond) {
859       case BoolTest::eq: cmeq(dst, size, src1, src2); break;
860       case BoolTest::ne: {
861         cmeq(dst, size, src1, src2);
862         notr(dst, T16B, dst);
863         break;
864       }
865       case BoolTest::ge: cmge(dst, size, src1, src2); break;
866       case BoolTest::gt: cmgt(dst, size, src1, src2); break;
867       case BoolTest::le: cmge(dst, size, src2, src1); break;
868       case BoolTest::lt: cmgt(dst, size, src2, src1); break;
869       case BoolTest::uge: cmhs(dst, size, src1, src2); break;
870       case BoolTest::ugt: cmhi(dst, size, src1, src2); break;
871       case BoolTest::ult: cmhi(dst, size, src2, src1); break;
872       case BoolTest::ule: cmhs(dst, size, src2, src1); break;
873       default:
874         assert(false, "unsupported");
875         ShouldNotReachHere();
876     }
877   }
878 }