1 /*
  2  * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #include "precompiled.hpp"
 26 #include "asm/assembler.hpp"
 27 #include "asm/assembler.inline.hpp"
 28 #include "opto/c2_MacroAssembler.hpp"
 29 #include "opto/intrinsicnode.hpp"
 30 #include "runtime/vm_version.hpp"
 31 
// BLOCK_COMMENT(str): forwards to block_comment(str) in non-PRODUCT builds and
// expands to nothing in PRODUCT builds.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
// BIND(label): binds the label and adds a block comment made of the label's name
// followed by ':'.
// NOTE: this expands to two statements, so it must not be used as the sole body
// of an unbraced if/else/loop.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
 38 
 39 
 40 void C2_MacroAssembler::fast_lock_lightweight(ConditionRegister flag, Register obj, Register box,
 41                                               Register tmp1, Register tmp2, Register tmp3) {
 42   compiler_fast_lock_lightweight_object(flag, obj, tmp1, tmp2, tmp3);
 43 }
 44 
 45 void C2_MacroAssembler::fast_unlock_lightweight(ConditionRegister flag, Register obj, Register box,
 46                                                 Register tmp1, Register tmp2, Register tmp3) {
 47   compiler_fast_unlock_lightweight_object(flag, obj, tmp1, tmp2, tmp3);
 48 }
 49 
 50 // Intrinsics for CompactStrings
 51 
// Compress char[] to byte[] by compressing 16 bytes at once.
//
// Emits a loop that runs (cnt >> 3) times. Each iteration loads 16 bytes from
// src (two 8-byte loads), packs one byte per 16-bit unit into 8 bytes and stores
// them to dst (two 4-byte stores). src is advanced by 16 and dst by 8 per
// completed iteration; cnt is only read. If (cnt >> 3) is zero the loop is
// skipped entirely. The trailing (cnt & 7) chars are not handled here: control
// falls through at the end so the caller can append a char-by-char loop.
//
// Branches to Lfailure when any 16-bit unit of a loaded group has a bit set
// outside byte_mask (0x7F if ascii, otherwise 0xFF). At that point src and dst
// have not yet been advanced for the failing group, and CTR has not yet been
// decremented for it.
//
// Kills: R0, tmp1-tmp5, CTR, CCR0.
//
// Byte diagrams in the trailing comments show the low bytes of a register, most
// significant first: a digit n is the payload byte of char n, '_' is any other byte.
void C2_MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
                                           Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
                                           Label& Lfailure, bool ascii) {

  const Register tmp0 = R0;
  const int byte_mask = ascii ? 0x7F : 0xFF;
  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
  Label Lloop, Lslow;

  // Check if cnt >= 8 (= 16 bytes)
  // The mask in tmp1 is built in three steps (lis, ori, rldimi); the count check is
  // interleaved with them, so the mask is only completed if the loop is entered.
  lis(tmp1, byte_mask);           // tmp1 = 0x00FF00FF00FF00FF (non ascii case) once ori + rldimi below are done
  srwi_(tmp2, cnt, 3);            // tmp2 = number of 8-char groups; sets CCR0.
  beq(CCR0, Lslow);               // No full group: skip the loop.
  ori(tmp1, tmp1, byte_mask);     // Low 32 bits of the mask are complete.
  rldimi(tmp1, tmp1, 32, 0);      // Replicate the low 32 bits into the high 32 bits.
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lloop);
  ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
  ld(tmp4, 8, src);               // _4_5_6_7

  orr(tmp0, tmp2, tmp4);          // Combine both halves so one test covers all 8 chars.
  rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
  rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
  rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
  rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7

  andc_(tmp0, tmp0, tmp1);        // Any bit outside the mask? Sets CCR0.
  bne(CCR0, Lfailure);            // Not latin1/ascii.
  addi(src, src, 16);             // Only advance src once the group is known to be compressible.

  rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
  srdi(tmp2, tmp2, 3*8);          // ____0_2_
  rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
  srdi(tmp4, tmp4, 3*8);          // ____4_6_

  orr(tmp2, tmp2, tmp3);          // ____0123
  orr(tmp4, tmp4, tmp5);          // ____4567

  stw(tmp2, 0, dst);
  stw(tmp4, 4, dst);
  addi(dst, dst, 8);
  bdnz(Lloop);

  bind(Lslow);                    // Fallback to slow version
}
100 
101 // Compress char[] to byte[]. cnt must be positive int.
102 void C2_MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp,
103                                         Label& Lfailure, bool ascii) {
104   const int byte_mask = ascii ? 0x7F : 0xFF;
105   Label Lloop;
106   mtctr(cnt);
107 
108   bind(Lloop);
109   lhz(tmp, 0, src);
110   cmplwi(CCR0, tmp, byte_mask);
111   bgt(CCR0, Lfailure);            // Not latin1/ascii.
112   addi(src, src, 2);
113   stb(tmp, 0, dst);
114   addi(dst, dst, 1);
115   bdnz(Lloop);
116 }
117 
// Encode len chars from src into bytes at dst and leave in result the number of
// chars that were encoded before the first char that does not fit the target
// encoding (all of len if every char fits).
//
// Structure: the 8-chars-at-a-time loop from string_compress_16() runs first,
// followed by the char-by-char loop from string_compress() for the rest. Both
// failure labels compute "remaining characters" in result; the common exit at
// Ldone turns that into len - remaining.
//
// Kills: tmp1-tmp5 and whatever string_compress_16()/string_compress() kill
// (R0, CTR, CCR0); src and dst are advanced.
void C2_MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
                                         Register result, bool ascii) {
  Label Lslow, Lfailure1, Lfailure2, Ldone;

  string_compress_16(src, dst, len, tmp1, tmp2, tmp3, tmp4, tmp5, Lfailure1, ascii);
  rldicl_(result, len, 0, 64-3); // Remaining characters.
  beq(CCR0, Ldone);              // Nothing left: result == 0, so Ldone yields len.
  bind(Lslow);
  string_compress(src, dst, result, tmp2, Lfailure2, ascii);
  li(result, 0);                 // Slow loop completed: zero characters remain.
  b(Ldone);

  // Fast loop hit a non-encodable char somewhere in the current 8-char group.
  // CTR still counts that group, and src/dst still point at its start, so the
  // group is rescanned char by char to locate the exact position.
  bind(Lfailure1);
  mr(result, len);
  mfctr(tmp1);
  rldimi_(result, tmp1, 3, 0); // Remaining characters.
  beq(CCR0, Ldone);
  b(Lslow);

  // Slow loop hit a non-encodable char: CTR has not been decremented for it yet.
  bind(Lfailure2);
  mfctr(result); // Remaining characters.

  bind(Ldone);
  subf(result, result, len);     // result = len - remaining.
}
144 
// Inflate byte[] to char[] by inflating 16 bytes at once.
//
// Emits a loop that runs (cnt >> 3) times. Each iteration loads 8 bytes from src
// (two 4-byte loads), spreads every byte into its own 16-bit unit and stores the
// resulting 16 bytes to dst (two 8-byte stores). src is advanced by 8 and dst by
// 16 per iteration; cnt is only read. If (cnt >> 3) is zero the loop is skipped.
// The trailing (cnt & 7) bytes are not handled here: control falls through at the
// end so the caller can append a byte-by-byte loop.
//
// Kills: R0, tmp1-tmp5, CTR, CCR0.
//
// Byte diagrams in the trailing comments show the low bytes of a register, most
// significant first: a digit n is source byte n, '_' is any other byte.
void C2_MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
                                          Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
  const Register tmp0 = R0;
  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
  Label Lloop, Lslow;

  // Check if cnt >= 8
  srwi_(tmp2, cnt, 3);            // tmp2 = number of 8-byte groups; sets CCR0.
  beq(CCR0, Lslow);               // No full group: skip the loop.
  lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF (after the ori below)
  ori(tmp1, tmp1, 0xFF);
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lloop);
  lwz(tmp2, 0, src);              // ____0123 (Big Endian)
  lwz(tmp4, 4, src);              // ____4567
  addi(src, src, 8);

  rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
  rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
  rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
  rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557

  andc(tmp0, tmp2, tmp1);         // ____0_1_
  rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
  andc(tmp3, tmp4, tmp1);         // ____4_5_
  rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7

  rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
  rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7

  std(tmp2, 0, dst);
  std(tmp4, 8, dst);
  addi(dst, dst, 16);
  bdnz(Lloop);

  bind(Lslow);                    // Fallback to slow version
}
185 
186 // Inflate byte[] to char[]. cnt must be positive int.
187 void C2_MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
188   Label Lloop;
189   mtctr(cnt);
190 
191   bind(Lloop);
192   lbz(tmp, 0, src);
193   addi(src, src, 1);
194   sth(tmp, 0, dst);
195   addi(dst, dst, 2);
196   bdnz(Lloop);
197 }
198 
// Compare two strings and leave the comparison value in result:
//  - the difference (chr1 - chr2) of the first pair of characters that differ, or
//  - cnt1 - cnt2 (after the count adjustment below) if one string is a prefix of
//    the other,
// negated in the UL case.
//
// ae selects the encoding pair (StrIntrinsicNode::LL/UU/LU/UL). It determines
// the element size used to read each string and how the incoming counts are
// adjusted (shifted right by one for 2-byte elements -- presumably because they
// arrive as byte counts; confirm against the caller).
//
// Kills: R0, tmp1, str1, str2, cnt1, cnt2, CTR, CCR0.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       Register tmp1, Register result, int ae) {
  const Register tmp0 = R0,
                 diff = tmp1;

  assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
  Label Ldone, Lslow, Lloop, Lreturn_diff;

  // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
  // we interchange str1 and str2 in the UL case and negate the result.
  // Like this, str1 is always latin1 encoded, except for the UU case.
  // In addition, we need 0 (or sign which is 0) extend.
  // (The interchange itself is not performed in this function; only the final
  // negation is. Presumably the caller passes the operands already swapped.)

  if (ae == StrIntrinsicNode::UU) {
    srwi(cnt1, cnt1, 1);
  } else {
    clrldi(cnt1, cnt1, 32);
  }

  if (ae != StrIntrinsicNode::LL) {
    srwi(cnt2, cnt2, 1);
  } else {
    clrldi(cnt2, cnt2, 32);
  }

  // See if the lengths are different, and calculate min in cnt1.
  // Save diff in case we need it for a tie-breaker.
  subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
  // if (diff > 0) { cnt1 = cnt2; }
  if (VM_Version::has_isel()) {
    isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
  } else {
    // Branching variant: also copies when diff == 0, which is harmless because
    // cnt1 == cnt2 in that case.
    Label Lskip;
    blt(CCR0, Lskip);
    mr(cnt1, cnt2);
    bind(Lskip);
  }

  // Rename registers
  Register chr1 = result;
  Register chr2 = tmp0;

  // Compare multiple characters in fast loop (only implemented for same encoding).
  // stride1/stride2 are emit-time values: 8 while the fast loop is generated, then
  // reassigned to the per-character element sizes by the switch further down.
  int stride1 = 8, stride2 = 8;
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
    Label Lfastloop, Lskipfast;

    srwi_(tmp0, cnt1, log2_chars_per_iter);
    beq(CCR0, Lskipfast);
    rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
    li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
    mtctr(tmp0);

    bind(Lfastloop);
    ld(chr1, 0, str1);
    ld(chr2, 0, str2);
    cmpd(CCR0, chr1, chr2);
    bne(CCR0, Lslow);             // Mismatch in this 8-byte group: locate it char by char (str1/str2 not advanced yet).
    addi(str1, str1, stride1);
    addi(str2, str2, stride2);
    bdnz(Lfastloop);
    mr(cnt1, cnt2); // Remaining characters.
    bind(Lskipfast);
  }

  // Loop which searches the first difference character by character.
  // The zero check is only on the fall-through path; the fast loop enters at
  // Lslow with cnt1 set to a nonzero constant.
  cmpwi(CCR0, cnt1, 0);
  beq(CCR0, Lreturn_diff);
  bind(Lslow);
  mtctr(cnt1);

  switch (ae) {
    case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
    case StrIntrinsicNode::UL: // fallthru (see comment above)
    case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
    case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
    default: ShouldNotReachHere(); break;
  }

  bind(Lloop);
  if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
  if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
  subf_(result, chr2, chr1); // result = chr1 - chr2
  bne(CCR0, Ldone);
  addi(str1, str1, stride1);
  addi(str2, str2, stride2);
  bdnz(Lloop);

  // If strings are equal up to min length, return the length difference.
  bind(Lreturn_diff);
  mr(result, diff);

  // Otherwise, return the difference between the first mismatched chars.
  bind(Ldone);
  if (ae == StrIntrinsicNode::UL) {
    neg(result, result); // Negate result (see note above).
  }
}
299 
// Compare two element sequences for equality; result is set to 1 if they are
// equal and to 0 otherwise.
//
// is_array_equ == true:  ary1/ary2 are array references. Identical references
//                        compare equal, a null reference or differing lengths
//                        compare unequal. limit is loaded from the array length
//                        and ary1/ary2 are advanced to the first element.
// is_array_equ == false: ary1/ary2 already point at the data and limit is
//                        supplied by the caller. For !is_byte the limit is
//                        shifted right by one extra bit before use, i.e. it is
//                        treated as twice the number of 2-byte elements.
//
// is_byte selects 1-byte (true) or 2-byte (false) elements.
//
// Kills: R0, tmp1, limit, ary1, ary2, CTR, CCR0 (and CCR1 if is_array_equ).
void C2_MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
                                     Register limit, Register tmp1, Register result, bool is_byte) {
  const Register tmp0 = R0;
  assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
  Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
  bool limit_needs_shift = false;

  if (is_array_equ) {
    const int length_offset = arrayOopDesc::length_offset_in_bytes();
    const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);

    // Return true if the same array.
    cmpd(CCR0, ary1, ary2);
    beq(CCR0, Lskiploop);

    // Return false if one of them is null.
    cmpdi(CCR0, ary1, 0);
    cmpdi(CCR1, ary2, 0);
    li(result, 0);                // Preset "not equal" for every early exit to Ldone below.
    cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
    beq(CCR0, Ldone);

    // Load the lengths of arrays.
    lwz(limit, length_offset, ary1);
    lwz(tmp0, length_offset, ary2);

    // Return false if the two arrays are not equal length.
    cmpw(CCR0, limit, tmp0);
    bne(CCR0, Ldone);

    // Load array addresses.
    addi(ary1, ary1, base_offset);
    addi(ary2, ary2, base_offset);
  } else {
    limit_needs_shift = !is_byte;
    li(result, 0); // Assume not equal.
  }

  // Rename registers
  Register chr1 = tmp0;
  Register chr2 = tmp1;

  // Compare 8 bytes per iteration in fast loop.
  const int log2_chars_per_iter = is_byte ? 3 : 2;

  srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
  beq(CCR0, Lskipfast);
  mtctr(tmp0);

  bind(Lfastloop);
  ld(chr1, 0, ary1);
  ld(chr2, 0, ary2);
  addi(ary1, ary1, 8);
  addi(ary2, ary2, 8);
  cmpd(CCR0, chr1, chr2);
  bne(CCR0, Ldone);               // Mismatch: result is still 0.
  bdnz(Lfastloop);

  bind(Lskipfast);
  // Rotating by 64 - 1 is a right shift by one once the mask is applied.
  rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
  beq(CCR0, Lskiploop);
  mtctr(limit);

  // Character by character.
  bind(Lloop);
  if (is_byte) {
    lbz(chr1, 0, ary1);
    lbz(chr2, 0, ary2);
    addi(ary1, ary1, 1);
    addi(ary2, ary2, 1);
  } else {
    lhz(chr1, 0, ary1);
    lhz(chr2, 0, ary2);
    addi(ary1, ary1, 2);
    addi(ary2, ary2, 2);
  }
  cmpw(CCR0, chr1, chr2);
  bne(CCR0, Ldone);               // Mismatch: result is still 0.
  bdnz(Lloop);

  bind(Lskiploop);
  li(result, 1); // All characters are equal.
  bind(Ldone);
}
384 
// Search haystack for the first occurrence of needle. result receives the index
// of the match in haystack characters, or -1 if there is none.
//
// ae             - encoding pair (StrIntrinsicNode::LL, UL or UU; LU is rejected
//                  by an assert). Haystack characters are 1 byte wide for LL and
//                  2 bytes otherwise; needle characters are 2 bytes wide for UU
//                  and 1 byte otherwise.
// needlecntval   - 0 means the needle length is only known at run time and is
//                  taken from the needlecnt register; any other value is the
//                  constant needle length (1 is rejected by a guarantee).
// needle_values  - not referenced by this function.
//
// Strategy: the first two needle characters are kept in n_start and compared
// against the haystack two positions per inner-loop iteration. On a hit, the
// rest of the needle is compared at that position; a mismatch there resumes the
// search at the following position.
//
// Kills: haycnt (reused as last_addr), tmp1-tmp4, R0, CTR, CCR0, CCR1; with a
// variable needle length also needlecnt and CCR6 (which carries the
// "needlecnt compared with 2" result across the whole search); with a constant
// needle length > 3 also needlecnt.
void C2_MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
                                       Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
                                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {

  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
  Label L_TooShort, L_Found, L_NotFound, L_End;
  Register last_addr = haycnt, // Kill haycnt at the beginning.
  addr      = tmp1,
  n_start   = tmp2,
  ch1       = tmp3,
  ch2       = R0;

  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
  const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; // Haystack character size in bytes.
  const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; // Needle character size in bytes.

  // **************************************************************************************************
  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
  // **************************************************************************************************

  // Compute last haystack addr to use if no match gets found.
  clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
  addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
  if (needlecntval == 0) { // variable needlecnt
   cmpwi(CCR6, needlecnt, 2);         // CCR6 is tested again at L_Comp1, so it must stay intact until then.
   clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
   blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
  }

  if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.

  if (needlecntval == 0) { // variable needlecnt
   subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
   addi(needlecnt, needlecnt, -2);    // Rest of needle.
  } else { // constant needlecnt
  guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
  assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
   addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
   if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
  }

  if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.

  // UL: widen the two 1-byte needle characters in n_start to two 2-byte characters
  // so they can be compared directly with a 4-byte haystack load.
  if (ae ==StrIntrinsicNode::UL) {
   srwi(tmp4, n_start, 1*8);          // ___0
   rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
  }

  add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).

  // Main Loop (now we have at least 2 characters).
  // L_OuterLoop is entered with addr one position before the next candidate,
  // both initially and after a failed comparison of the needle's remainder.
  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
  bind(L_OuterLoop); // Search for 1st 2 characters.
  Register addr_diff = tmp4;
   subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
   addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
   srdi_(ch2, addr_diff, h_csize);    // Shift by 1 (bytes) or 2 (2-byte chars): remaining positions / 2.
   beq(CCR0, L_FinalCheck);           // 2 characters left?
   mtctr(ch2);                        // num of characters / 2
  bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
   if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
    lwz(ch1, 0, addr);
    lwz(ch2, 2, addr);
   } else {
    lhz(ch1, 0, addr);
    lhz(ch2, 1, addr);
   }
   cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
   cmpw(CCR1, ch2, n_start);
   beq(CCR0, L_Comp1);                // Did we find the needle start?
   beq(CCR1, L_Comp2);
   addi(addr, addr, 2 * h_csize);
   bdnz(L_InnerLoop);
  bind(L_FinalCheck);
   andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
   beq(CCR0, L_NotFound);
   if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
   cmpw(CCR1, ch1, n_start);
   beq(CCR1, L_Comp1);
  bind(L_NotFound);
   li(result, -1);                    // not found
   b(L_End);

   // **************************************************************************************************
   // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
   // **************************************************************************************************
  if (needlecntval == 0) {           // We have to handle these cases separately.
  Label L_OneCharLoop;
  bind(L_TooShort);
   mtctr(haycnt);                    // Reached before haycnt is overwritten as last_addr, so it still holds the count.
   if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
  bind(L_OneCharLoop);
   if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } // Load with update: addr advances to the loaded character.
   cmpw(CCR1, ch1, n_start);
   beq(CCR1, L_Found);               // Did we find the one character needle?
   bdnz(L_OneCharLoop);
   li(result, -1);                   // Not found.
   b(L_End);
  }

  // **************************************************************************************************
  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
  // **************************************************************************************************

  // Compare the rest
  bind(L_Comp2);
   addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
  bind(L_Comp1);                     // Addr points to possible needle start.
  if (needlecntval != 2) {           // Const needlecnt==2?
   if (needlecntval != 3) {
    if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
    Register n_ind = tmp4,
             h_ind = n_ind;
    li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
    mtctr(needlecnt);                // Decremented by 2, still > 0.
   Label L_CompLoop;
   bind(L_CompLoop);
    // UL: needle and haystack use different character sizes, so the haystack byte
    // index is twice the needle byte index and gets its own register (ch1, which
    // the haystack load below then overwrites). Otherwise both share n_ind.
    if (ae ==StrIntrinsicNode::UL) {
      h_ind = ch1;
      sldi(h_ind, n_ind, 1);
    }
    if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
    if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
    cmpw(CCR1, ch1, ch2);
    bne(CCR1, L_OuterLoop);          // Mismatch: continue the search after this position.
    addi(n_ind, n_ind, n_csize);
    bdnz(L_CompLoop);
   } else { // No loop required if there's only one needle character left.
    if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
    if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
    cmpw(CCR1, ch1, ch2);
    bne(CCR1, L_OuterLoop);          // Mismatch: continue the search after this position.
   }
  }
  // Return index ...
  bind(L_Found);
   subf(result, haystack, addr);     // relative to haystack, ...
   if (h_csize == 2) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof
525 
526 void C2_MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
527                                             Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
528   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
529 
530   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
531   Register addr = tmp1,
532            ch1 = tmp2,
533            ch2 = R0;
534 
535   const int h_csize = is_byte ? 1 : 2;
536 
537 //4:
538    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
539    mr(addr, haystack);
540    beq(CCR0, L_FinalCheck);
541    mtctr(tmp2);              // Move to count register.
542 //8:
543   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
544    if (!is_byte) {
545     lhz(ch1, 0, addr);
546     lhz(ch2, 2, addr);
547    } else {
548     lbz(ch1, 0, addr);
549     lbz(ch2, 1, addr);
550    }
551    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
552    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
553    beq(CCR0, L_Found1);      // Did we find the needle?
554    beq(CCR1, L_Found2);
555    addi(addr, addr, 2 * h_csize);
556    bdnz(L_InnerLoop);
557 //16:
558   bind(L_FinalCheck);
559    andi_(R0, haycnt, 1);
560    beq(CCR0, L_NotFound);
561    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
562    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
563    beq(CCR1, L_Found1);
564 //21:
565   bind(L_NotFound);
566    li(result, -1);           // Not found.
567    b(L_End);
568 
569   bind(L_Found2);
570    addi(addr, addr, h_csize);
571 //24:
572   bind(L_Found1);            // Return index ...
573    subf(result, haystack, addr); // relative to haystack, ...
574    if (!is_byte) { srdi(result, result, 1); } // in characters.
575   bind(L_End);
576 } // string_indexof_char
577 
578 
// Scan cnt bytes starting at src and leave in result the offset (from src) of
// the first byte that has bit 0x80 set, or cnt if no such byte exists.
//
// A fast loop first tests 16 bytes per iteration against 0x8080808080808080.
// When a group contains a set bit (or fewer than 16 bytes remain), a
// byte-by-byte loop continues from the start of that group.
//
// Kills: R0, tmp1, tmp2, CTR, CCR0. src and cnt are only read.
void C2_MacroAssembler::count_positives(Register src, Register cnt, Register result,
                                        Register tmp1, Register tmp2) {
  const Register tmp0 = R0;
  assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
  Label Lfastloop, Lslow, Lloop, Ldone;

  // Check if cnt >= 16 bytes (one fast-loop iteration).
  // The mask in tmp1 is built in three steps (lis, ori, rldimi) interleaved with
  // the count check; it is only completed if the fast loop is entered, and the
  // slow loop does not use it.
  lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080 once ori + rldimi below are done
  srwi_(tmp2, cnt, 4);            // tmp2 = number of 16-byte groups; sets CCR0.
  mr(result, src);                // Use result reg to point to the current position.
  beq(CCR0, Lslow);
  ori(tmp1, tmp1, 0x8080);
  rldimi(tmp1, tmp1, 32, 0);      // Replicate the low 32 bits into the high 32 bits.
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lfastloop);
  ld(tmp2, 0, result);
  ld(tmp0, 8, result);

  orr(tmp0, tmp2, tmp0);          // Combine both halves so one test covers all 16 bytes.

  and_(tmp0, tmp0, tmp1);
  bne(CCR0, Lslow);               // Found negative byte.
  addi(result, result, 16);       // Only advance past groups known to be all positive.
  bdnz(Lfastloop);

  bind(Lslow);                    // Fallback to slow version.
  subf(tmp0, src, result);        // Bytes known positive.
  subf_(tmp0, tmp0, cnt);         // Remaining Bytes.
  beq(CCR0, Ldone);
  mtctr(tmp0);
  bind(Lloop);
  lbz(tmp0, 0, result);
  andi_(tmp0, tmp0, 0x80);
  bne(CCR0, Ldone);               // Found negative byte.
  addi(result, result, 1);
  bdnz(Lloop);

  bind(Ldone);
  subf(result, src, result);      // Result is offset from src.
}