1 /*
   2  * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_CodeStubs.hpp"
  29 #include "opto/c2_MacroAssembler.hpp"
  30 #include "opto/compile.hpp"
  31 #include "opto/output.hpp"
  32 #include "opto/intrinsicnode.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
  50                                   Register tmp2Reg, Register tmp3Reg) {
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmpReg;
  54   Register tmp = tmp2Reg;
  55   Label cont;
  56   Label object_has_monitor;
  57   Label cas_failed;
  58 
  59   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  60   assert_different_registers(oop, box, tmp, disp_hdr);
  61 
  62   // Load markWord from object into displaced_header.
  63   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  64 
  65   if (DiagnoseSyncOnValueBasedClasses != 0) {
  66     load_klass(tmp, oop);
  67     ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
  68     tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
  69     br(Assembler::NE, cont);
  70   }
  71 
  72   if (UseBiasedLocking && !UseOptoBiasInlining) {
  73     biased_locking_enter(box, oop, disp_hdr, tmp, true, cont);
  74   }
  75 
  76   // Check for existing monitor
  77   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
  78 
  79   if (LockingMode == LM_MONITOR) {
  80     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
  81     b(cont);
  82   } else {
  83     assert(LockingMode == LM_LEGACY, "must be");
  84     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  85     orr(tmp, disp_hdr, markWord::unlocked_value);
  86 
  87     // Initialize the box. (Must happen before we update the object mark!)
  88     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  89 
  90     // Compare object markWord with an unlocked value (tmp) and if
  91     // equal exchange the stack address of our box with object markWord.
  92     // On failure disp_hdr contains the possibly locked markWord.
  93     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
  94             /*release*/ true, /*weak*/ false, disp_hdr);
  95     br(Assembler::EQ, cont);
  96 
  97     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  98 
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.
 101 
 102     bind(cas_failed);
 103     // We did not see an unlocked object so try the fast recursive case.
 104 
 105     // Check if the owner is self by comparing the value in the
 106     // markWord of object (disp_hdr) with the stack pointer.
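    // Roughly, in pseudocode (a sketch; page_size = os::vm_page_size()):
    //   recursive = ((mark - sp) & (~(page_size - 1) | lock_mask_in_place)) == 0
    // i.e. the markWord holds a stack address no more than a page above sp,
    // so the lock is already owned by this thread.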
 107     mov(rscratch1, sp);
 108     sub(disp_hdr, disp_hdr, rscratch1);
 109     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition holds we are done (we continue at cont) and hence we can
    // store 0 as the displaced header in the box, which indicates that it is a
    // recursive lock.
 112     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 113     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 114     b(cont);
 115   }
 116 
 117   // Handle existing monitor.
 118   bind(object_has_monitor);
 119 
 120   // The object's monitor m is unlocked iff m->owner == NULL,
 121   // otherwise m->owner may contain a thread or a stack address.
 122   //
 123   // Try to CAS m->owner from NULL to current thread.
 124   add(tmp, disp_hdr, (ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value));
 125   cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
 126           /*release*/ true, /*weak*/ false, rscratch1); // Sets flags for result
 127 
 128   // Store a non-null value into the box to avoid looking like a re-entrant
 129   // lock. The fast-path monitor unlock code checks for
 130   // markWord::monitor_value so use markWord::unused_mark which has the
 131   // relevant bit set, and also matches ObjectSynchronizer::enter.
 132   mov(tmp, (address)markWord::unused_mark().value());
 133   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 134 
 135   br(Assembler::EQ, cont); // CAS success means locking succeeded
 136 
 137   cmp(rscratch1, rthread);
 138   br(Assembler::NE, cont); // Check for recursive locking
 139 
 140   // Recursive lock case
 141   increment(Address(disp_hdr, ObjectMonitor::recursions_offset_in_bytes() - markWord::monitor_value), 1);
 142   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 143 
 144   bind(cont);
 145   // flag == EQ indicates success
 146   // flag == NE indicates failure
 147 }
 148 
 149 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 150                                     Register tmp2Reg) {
 151   Register oop = objectReg;
 152   Register box = boxReg;
 153   Register disp_hdr = tmpReg;
 154   Register tmp = tmp2Reg;
 155   Label cont;
 156   Label object_has_monitor;
 157 
 158   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 159   assert_different_registers(oop, box, tmp, disp_hdr);
 160 
 161   if (UseBiasedLocking && !UseOptoBiasInlining) {
 162     biased_locking_exit(oop, tmp, cont);
 163   }
 164 
 165   if (LockingMode == LM_LEGACY) {
 166     // Find the lock address and load the displaced header from the stack.
 167     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 168 
 169     // If the displaced header is 0, we have a recursive unlock.
 170     cmp(disp_hdr, zr);
 171     br(Assembler::EQ, cont);
 172   }
 173 
 174   // Handle existing monitor.
 175   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 176   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 177 
 178   if (LockingMode == LM_MONITOR) {
 179     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 180     b(cont);
 181   } else {
 182     assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the BasicLock in the markWord of the
    // object.
 186 
 187     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 188             /*release*/ true, /*weak*/ false, tmp);
 189     b(cont);
 190   }
 191 
 192   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 193 
 194   // Handle existing monitor.
 195   bind(object_has_monitor);
 196   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 197   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 198 
 199   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset_in_bytes()));
 200 
 201   Label notRecursive;
 202   cbz(disp_hdr, notRecursive);
 203 
 204   // Recursive lock
 205   sub(disp_hdr, disp_hdr, 1u);
 206   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset_in_bytes()));
 207   cmp(disp_hdr, disp_hdr); // Sets flags for result
 208   b(cont);
 209 
 210   bind(notRecursive);
 211   ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset_in_bytes()));
 212   ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset_in_bytes()));
 213   orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
 214   cmp(rscratch1, zr); // Sets flags for result
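  // If EntryList or cxq is non-empty we must not release the lock here:
  // exit with NE (set by the cmp above) so the runtime exit path can wake a successor.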
 215   cbnz(rscratch1, cont);
 216   // need a release store here
 217   lea(tmp, Address(tmp, ObjectMonitor::owner_offset_in_bytes()));
 218   stlr(zr, tmp); // set unowned
 219 
 220   bind(cont);
 221   // flag == EQ indicates success
 222   // flag == NE indicates failure
 223 }
 224 
 225 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1,
 226                                               Register t2, Register t3) {
 227   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 228   assert_different_registers(obj, t1, t2, t3);
 229 
 230   // Handle inflated monitor.
 231   Label inflated;
  // Finish fast lock successfully. MUST branch to this label with flag == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to this label with flag == NE.
 235   Label slow_path;
 236 
 237   if (DiagnoseSyncOnValueBasedClasses != 0) {
 238     load_klass(t1, obj);
 239     ldrw(t1, Address(t1, Klass::access_flags_offset()));
 240     tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
 241     br(Assembler::NE, slow_path);
 242   }
 243 
 244   const Register t1_mark = t1;
 245 
 246   { // Lightweight locking
 247 
 248     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 249     Label push;
 250 
 251     const Register t2_top = t2;
 252     const Register t3_t = t3;
 253 
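    // Fast-path outline (a sketch of the code below):
    //   if (lock-stack is full)              goto slow_path;
    //   if (top lock-stack entry == obj)     goto push;       // recursive
    //   if (mark has the monitor bit set)    goto inflated;
    //   try CAS(mark: unlocked -> locked);   on failure goto slow_path;
    //   push: push obj onto the lock-stack and finish with flag == EQ.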
 254     // Check if lock-stack is full.
 255     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 256     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 257     br(Assembler::GT, slow_path);
 258 
 259     // Check if recursive.
 260     subw(t3_t, t2_top, oopSize);
 261     ldr(t3_t, Address(rthread, t3_t));
 262     cmp(obj, t3_t);
 263     br(Assembler::EQ, push);
 264 
 265     // Relaxed normal load to check for monitor. Optimization for monitor case.
 266     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 267     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 268 
 269     // Not inflated
 270     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 271 
 272     // Try to lock. Transition lock-bits 0b01 => 0b00
 273     orr(t1_mark, t1_mark, markWord::unlocked_value);
 274     eor(t3_t, t1_mark, markWord::unlocked_value);
 275     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 276             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 277     br(Assembler::NE, slow_path);
 278 
 279     bind(push);
 280     // After successful lock, push object on lock-stack.
 281     str(obj, Address(rthread, t2_top));
 282     addw(t2_top, t2_top, oopSize);
 283     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 284     b(locked);
 285   }
 286 
 287   { // Handle inflated monitor.
 288     bind(inflated);
 289 
 290     // mark contains the tagged ObjectMonitor*.
 291     const Register t1_tagged_monitor = t1_mark;
 292     const uintptr_t monitor_tag = markWord::monitor_value;
 293     const Register t2_owner_addr = t2;
 294     const Register t3_owner = t3;
 295 
 296     // Compute owner address.
 297     lea(t2_owner_addr, Address(t1_tagged_monitor, ObjectMonitor::owner_offset_in_bytes() - monitor_tag));
 298 
 299     // CAS owner (null => current thread).
 300     cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
 301             /*release*/ false, /*weak*/ false, t3_owner);
 302     br(Assembler::EQ, locked);
 303 
 304     // Check if recursive.
 305     cmp(t3_owner, rthread);
 306     br(Assembler::NE, slow_path);
 307 
 308     // Recursive.
 309     increment(Address(t1_tagged_monitor, ObjectMonitor::recursions_offset_in_bytes() - monitor_tag), 1);
 310   }
 311 
 312   bind(locked);
 313 #ifdef ASSERT
 314   // Check that locked label is reached with Flags == EQ.
 315   Label flag_correct;
 316   br(Assembler::EQ, flag_correct);
 317   stop("Fast Lock Flag != EQ");
 318 #endif
 319 
 320   bind(slow_path);
 321 #ifdef ASSERT
 322   // Check that slow_path label is reached with Flags == NE.
 323   br(Assembler::NE, flag_correct);
 324   stop("Fast Lock Flag != NE");
 325   bind(flag_correct);
 326 #endif
 327   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 328 }
 329 
 330 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2,
 331                                                 Register t3) {
 332   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 333   assert_different_registers(obj, t1, t2, t3);
 334 
 335   // Handle inflated monitor.
 336   Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST branch to this label with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to this label with flag == NE.
 340   Label slow_path;
 341 
 342   const Register t1_mark = t1;
 343   const Register t2_top = t2;
 344   const Register t3_t = t3;
 345 
 346   { // Lightweight unlock
 347 
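    // Fast-path outline (a sketch of the code below):
    //   if (top lock-stack entry != obj)     goto inflated_load_monitor;
    //   pop the lock-stack;
    //   if (new top entry == obj)            goto unlocked;   // recursive
    //   if (mark has the monitor bit set)    goto inflated;
    //   try CAS(mark: locked -> unlocked);   on failure restore the lock-stack
    //                                        and goto slow_path.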
 348     // Check if obj is top of lock-stack.
 349     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 350     subw(t2_top, t2_top, oopSize);
 351     ldr(t3_t, Address(rthread, t2_top));
 352     cmp(obj, t3_t);
 353     // Top of lock stack was not obj. Must be monitor.
 354     br(Assembler::NE, inflated_load_monitor);
 355 
 356     // Pop lock-stack.
 357     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 358     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 359 
 360     // Check if recursive.
 361     subw(t3_t, t2_top, oopSize);
 362     ldr(t3_t, Address(rthread, t3_t));
 363     cmp(obj, t3_t);
 364     br(Assembler::EQ, unlocked);
 365 
 366     // Not recursive.
 367     // Load Mark.
 368     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 369 
 370     // Check header for monitor (0b10).
 371     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 372 
 373     // Try to unlock. Transition lock bits 0b00 => 0b01
 374     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 375     orr(t3_t, t1_mark, markWord::unlocked_value);
 376     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 377             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 378     br(Assembler::EQ, unlocked);
 379 
 380     // Compare and exchange failed.
 381     // Restore lock-stack and handle the unlock in runtime.
 382     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 383     addw(t2_top, t2_top, oopSize);
 384     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 385     b(slow_path);
 386   }
 387 
 388 
 389   { // Handle inflated monitor.
 390     bind(inflated_load_monitor);
 391     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 392 #ifdef ASSERT
 393     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 394     stop("Fast Unlock not monitor");
 395 #endif
 396 
 397     bind(inflated);
 398 
 399 #ifdef ASSERT
 400     Label check_done;
 401     subw(t2_top, t2_top, oopSize);
 402     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 403     br(Assembler::LT, check_done);
 404     ldr(t3_t, Address(rthread, t2_top));
 405     cmp(obj, t3_t);
 406     br(Assembler::NE, inflated);
 407     stop("Fast Unlock lock on stack");
 408     bind(check_done);
 409 #endif
 410 
 411     // mark contains the tagged ObjectMonitor*.
 412     const Register t1_monitor = t1_mark;
 413     const uintptr_t monitor_tag = markWord::monitor_value;
 414 
 415     // Untag the monitor.
 416     sub(t1_monitor, t1_mark, monitor_tag);
 417 
 418     const Register t2_recursions = t2;
 419     Label not_recursive;
 420 
 421     // Check if recursive.
 422     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset_in_bytes()));
 423     cbz(t2_recursions, not_recursive);
 424 
 425     // Recursive unlock.
 426     sub(t2_recursions, t2_recursions, 1u);
 427     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset_in_bytes()));
 428     // Set flag == EQ
 429     cmp(t2_recursions, t2_recursions);
 430     b(unlocked);
 431 
 432     bind(not_recursive);
 433 
 434     Label release;
 435     const Register t2_owner_addr = t2;
 436 
 437     // Compute owner address.
 438     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset_in_bytes()));
 439 
 440     // Check if the entry lists are empty.
 441     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset_in_bytes()));
 442     ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset_in_bytes()));
 443     orr(rscratch1, rscratch1, t3_t);
 444     cmp(rscratch1, zr);
 445     br(Assembler::EQ, release);
 446 
    // The owner may be anonymous, and we removed the last obj entry from
    // the lock-stack, which loses the information about the owner.
    // Write the thread to the owner field so the runtime knows the owner.
 450     str(rthread, Address(t2_owner_addr));
 451     b(slow_path);
 452 
 453     bind(release);
 454     // Set owner to null.
 455     // Release to satisfy the JMM
 456     stlr(zr, t2_owner_addr);
 457   }
 458 
 459   bind(unlocked);
 460 #ifdef ASSERT
 461   // Check that unlocked label is reached with Flags == EQ.
 462   Label flag_correct;
 463   br(Assembler::EQ, flag_correct);
 464   stop("Fast Unlock Flag != EQ");
 465 #endif
 466 
 467   bind(slow_path);
 468 #ifdef ASSERT
 469   // Check that slow_path label is reached with Flags == NE.
 470   br(Assembler::NE, flag_correct);
 471   stop("Fast Unlock Flag != NE");
 472   bind(flag_correct);
 473 #endif
 474   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 475 }
 476 
 477 // Search for str1 in str2 and return index or -1
 478 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 479 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 480                                        Register cnt2, Register cnt1,
 481                                        Register tmp1, Register tmp2,
 482                                        Register tmp3, Register tmp4,
 483                                        Register tmp5, Register tmp6,
 484                                        int icnt1, Register result, int ae) {
  // NOTE: tmp5 and tmp6 can be zr depending on the specific method version
 486   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 487 
 488   Register ch1 = rscratch1;
 489   Register ch2 = rscratch2;
 490   Register cnt1tmp = tmp1;
 491   Register cnt2tmp = tmp2;
 492   Register cnt1_neg = cnt1;
 493   Register cnt2_neg = cnt2;
 494   Register result_tmp = tmp4;
 495 
 496   bool isL = ae == StrIntrinsicNode::LL;
 497 
 498   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 499   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 500   int str1_chr_shift = str1_isL ? 0:1;
 501   int str2_chr_shift = str2_isL ? 0:1;
 502   int str1_chr_size = str1_isL ? 1:2;
 503   int str2_chr_size = str2_isL ? 1:2;
 504   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 505                                       (chr_insn)&MacroAssembler::ldrh;
 506   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 507                                       (chr_insn)&MacroAssembler::ldrh;
 508   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 509   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 510 
 511   // Note, inline_string_indexOf() generates checks:
 512   // if (substr.count > string.count) return -1;
 513   // if (substr.count == 0) return 0;
 514 
 515   // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 517 
  // For larger pattern and source we use a simplified Boyer-Moore algorithm.
 519   // With a small pattern and source we use linear scan.
 520 
 521   if (icnt1 == -1) {
 522     sub(result_tmp, cnt2, cnt1);
 523     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 524     br(LT, LINEARSEARCH);
 525     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 526     subs(zr, cnt1, 256);
 527     lsr(tmp1, cnt2, 2);
 528     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 529     br(GE, LINEARSTUB);
 530   }
 531 
// The Boyer-Moore algorithm is based on the description here:-
 533 //
 534 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 535 //
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 537 // and the 'Good Suffix' rule.
 538 //
 539 // These rules are essentially heuristics for how far we can shift the
 540 // pattern along the search string.
 541 //
 542 // The implementation here uses the 'Bad Character' rule only because of the
 543 // complexity of initialisation for the 'Good Suffix' rule.
 544 //
 545 // This is also known as the Boyer-Moore-Horspool algorithm:-
 546 //
 547 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 548 //
// This particular implementation has a few Java-specific optimizations.
 550 //
 551 // #define ASIZE 256
 552 //
 553 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 554 //       int i, j;
 555 //       unsigned c;
 556 //       unsigned char bc[ASIZE];
 557 //
 558 //       /* Preprocessing */
 559 //       for (i = 0; i < ASIZE; ++i)
 560 //          bc[i] = m;
 561 //       for (i = 0; i < m - 1; ) {
 562 //          c = x[i];
 563 //          ++i;
 564 //          // c < 256 for Latin1 string, so, no need for branch
 565 //          #ifdef PATTERN_STRING_IS_LATIN1
 566 //          bc[c] = m - i;
 567 //          #else
 568 //          if (c < ASIZE) bc[c] = m - i;
 569 //          #endif
 570 //       }
 571 //
 572 //       /* Searching */
 573 //       j = 0;
 574 //       while (j <= n - m) {
//          c = y[j+m-1];
 576 //          if (x[m-1] == c)
 577 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 578 //          if (i < 0) return j;
 579 //          // c < 256 for Latin1 string, so, no need for branch
 580 //          #ifdef SOURCE_STRING_IS_LATIN1
 581 //          // LL case: (c< 256) always true. Remove branch
 582 //          j += bc[y[j+m-1]];
 583 //          #endif
//          #ifdef PATTERN_STRING_IS_UTF
 585 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 586 //          if (c < ASIZE)
 587 //            j += bc[y[j+m-1]];
 588 //          else
 589 //            j += 1
 590 //          #endif
 591 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 592 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 593 //          if (c < ASIZE)
 594 //            j += bc[y[j+m-1]];
 595 //          else
 596 //            j += m
 597 //          #endif
//       }
//       return -1;
//    }
 600 
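// In the code below the bad-character table bc[] lives on the stack: ASIZE
// bytes are reserved at sp, initialized to cnt1 via stpq of the value
// duplicated into v0, and per-character skip distances are stored with strb
// and read back with ldrb. str2end marks the last position at which the
// pattern can still start in the source string.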
 601   if (icnt1 == -1) {
 602     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 603         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 604     Register cnt1end = tmp2;
 605     Register str2end = cnt2;
 606     Register skipch = tmp2;
 607 
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU characters), and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
 612     const int firstStep = isL ? 7 : 3;
 613 
 614     const int ASIZE = 256;
 615     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 616     sub(sp, sp, ASIZE);
 617     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 618     mov(ch1, sp);
 619     BIND(BM_INIT_LOOP);
 620       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 621       subs(tmp5, tmp5, 1);
 622       br(GT, BM_INIT_LOOP);
 623 
 624       sub(cnt1tmp, cnt1, 1);
 625       mov(tmp5, str2);
 626       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 627       sub(ch2, cnt1, 1);
 628       mov(tmp3, str1);
 629     BIND(BCLOOP);
 630       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 631       if (!str1_isL) {
 632         subs(zr, ch1, ASIZE);
 633         br(HS, BCSKIP);
 634       }
 635       strb(ch2, Address(sp, ch1));
 636     BIND(BCSKIP);
 637       subs(ch2, ch2, 1);
 638       br(GT, BCLOOP);
 639 
 640       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 641       if (str1_isL == str2_isL) {
 642         // load last 8 bytes (8LL/4UU symbols)
 643         ldr(tmp6, Address(tmp6, -wordSize));
 644       } else {
 645         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
 648         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 649         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 650         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 651         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 652         orr(ch2, ch1, ch2, LSL, 16);
 653         orr(tmp6, tmp6, tmp3, LSL, 48);
 654         orr(tmp6, tmp6, ch2, LSL, 16);
 655       }
 656     BIND(BMLOOPSTR2);
 657       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 658       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 659       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop, but
        // that would hurt performance on in-order systems with 2 or more ld/st pipelines
 663         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 664       }
 665       if (!isL) { // UU/UL case
 666         lsl(ch2, cnt1tmp, 1); // offset in bytes
 667       }
 668       cmp(tmp3, skipch);
 669       br(NE, BMSKIP);
 670       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 671       mov(ch1, tmp6);
 672       if (isL) {
 673         b(BMLOOPSTR1_AFTER_LOAD);
 674       } else {
 675         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 676         b(BMLOOPSTR1_CMP);
 677       }
 678     BIND(BMLOOPSTR1);
 679       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 680       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 681     BIND(BMLOOPSTR1_AFTER_LOAD);
 682       subs(cnt1tmp, cnt1tmp, 1);
 683       br(LT, BMLOOPSTR1_LASTCMP);
 684     BIND(BMLOOPSTR1_CMP);
 685       cmp(ch1, ch2);
 686       br(EQ, BMLOOPSTR1);
 687     BIND(BMSKIP);
 688       if (!isL) {
        // if we've encountered a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols
 691         if (str1_isL != str2_isL) {
 692           mov(result_tmp, cnt1);
 693         } else {
 694           mov(result_tmp, 1);
 695         }
 696         subs(zr, skipch, ASIZE);
 697         br(HS, BMADV);
 698       }
 699       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 700     BIND(BMADV);
 701       sub(cnt1tmp, cnt1, 1);
 702       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 703       cmp(str2, str2end);
 704       br(LE, BMLOOPSTR2);
 705       add(sp, sp, ASIZE);
 706       b(NOMATCH);
 707     BIND(BMLOOPSTR1_LASTCMP);
 708       cmp(ch1, ch2);
 709       br(NE, BMSKIP);
 710     BIND(BMMATCH);
 711       sub(result, str2, tmp5);
 712       if (!str2_isL) lsr(result, result, 1);
 713       add(sp, sp, ASIZE);
 714       b(DONE);
 715 
 716     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
 718     br(LT, LINEAR_MEDIUM);
 719     mov(result, zr);
 720     RuntimeAddress stub = NULL;
 721     if (isL) {
 722       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 723       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
 724     } else if (str1_isL) {
 725       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
 727     } else {
 728       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 729       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
 730     }
 731     trampoline_call(stub);
 732     b(DONE);
 733   }
 734 
 735   BIND(LINEARSEARCH);
 736   {
 737     Label DO1, DO2, DO3;
 738 
 739     Register str2tmp = tmp2;
 740     Register first = tmp3;
 741 
 742     if (icnt1 == -1)
 743     {
 744         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 745 
 746         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 747         br(LT, DOSHORT);
 748       BIND(LINEAR_MEDIUM);
 749         (this->*str1_load_1chr)(first, Address(str1));
 750         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 751         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 752         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 753         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 754 
 755       BIND(FIRST_LOOP);
 756         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 757         cmp(first, ch2);
 758         br(EQ, STR1_LOOP);
 759       BIND(STR2_NEXT);
 760         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 761         br(LE, FIRST_LOOP);
 762         b(NOMATCH);
 763 
 764       BIND(STR1_LOOP);
 765         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 766         add(cnt2tmp, cnt2_neg, str2_chr_size);
 767         br(GE, MATCH);
 768 
 769       BIND(STR1_NEXT);
 770         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 771         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 772         cmp(ch1, ch2);
 773         br(NE, STR2_NEXT);
 774         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 775         add(cnt2tmp, cnt2tmp, str2_chr_size);
 776         br(LT, STR1_NEXT);
 777         b(MATCH);
 778 
 779       BIND(DOSHORT);
 780       if (str1_isL == str2_isL) {
 781         cmp(cnt1, (u1)2);
 782         br(LT, DO1);
 783         br(GT, DO3);
 784       }
 785     }
 786 
 787     if (icnt1 == 4) {
 788       Label CH1_LOOP;
 789 
 790         (this->*load_4chr)(ch1, str1);
 791         sub(result_tmp, cnt2, 4);
 792         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 793         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 794 
 795       BIND(CH1_LOOP);
 796         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 797         cmp(ch1, ch2);
 798         br(EQ, MATCH);
 799         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 800         br(LE, CH1_LOOP);
 801         b(NOMATCH);
 802       }
 803 
 804     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 805       Label CH1_LOOP;
 806 
 807       BIND(DO2);
 808         (this->*load_2chr)(ch1, str1);
 809         if (icnt1 == 2) {
 810           sub(result_tmp, cnt2, 2);
 811         }
 812         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 813         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 814       BIND(CH1_LOOP);
 815         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 816         cmp(ch1, ch2);
 817         br(EQ, MATCH);
 818         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 819         br(LE, CH1_LOOP);
 820         b(NOMATCH);
 821     }
 822 
 823     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 824       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 825 
 826       BIND(DO3);
 827         (this->*load_2chr)(first, str1);
 828         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 829         if (icnt1 == 3) {
 830           sub(result_tmp, cnt2, 3);
 831         }
 832         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 833         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 834       BIND(FIRST_LOOP);
 835         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 836         cmpw(first, ch2);
 837         br(EQ, STR1_LOOP);
 838       BIND(STR2_NEXT);
 839         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 840         br(LE, FIRST_LOOP);
 841         b(NOMATCH);
 842 
 843       BIND(STR1_LOOP);
 844         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 845         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 846         cmp(ch1, ch2);
 847         br(NE, STR2_NEXT);
 848         b(MATCH);
 849     }
 850 
 851     if (icnt1 == -1 || icnt1 == 1) {
 852       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 853 
 854       BIND(DO1);
 855         (this->*str1_load_1chr)(ch1, str1);
 856         cmp(cnt2, (u1)8);
 857         br(LT, DO1_SHORT);
 858 
 859         sub(result_tmp, cnt2, 8/str2_chr_size);
 860         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 861         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 862         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 863 
 864         if (str2_isL) {
 865           orr(ch1, ch1, ch1, LSL, 8);
 866         }
 867         orr(ch1, ch1, ch1, LSL, 16);
 868         orr(ch1, ch1, ch1, LSL, 32);
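        // The loop below uses the usual SWAR trick for locating a matching
        // character: xor-ing with the broadcast pattern turns a match into a
        // zero byte/halfword, and (v - 0x01..01) & ~v & 0x80..80 (computed by
        // the bics via the 0x7f..7f mask) is non-zero iff such a zero element
        // exists.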
 869       BIND(CH1_LOOP);
 870         ldr(ch2, Address(str2, cnt2_neg));
 871         eor(ch2, ch1, ch2);
 872         sub(tmp1, ch2, tmp3);
 873         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 874         bics(tmp1, tmp1, tmp2);
 875         br(NE, HAS_ZERO);
 876         adds(cnt2_neg, cnt2_neg, 8);
 877         br(LT, CH1_LOOP);
 878 
 879         cmp(cnt2_neg, (u1)8);
 880         mov(cnt2_neg, 0);
 881         br(LT, CH1_LOOP);
 882         b(NOMATCH);
 883 
 884       BIND(HAS_ZERO);
 885         rev(tmp1, tmp1);
 886         clz(tmp1, tmp1);
 887         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 888         b(MATCH);
 889 
 890       BIND(DO1_SHORT);
 891         mov(result_tmp, cnt2);
 892         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 893         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 894       BIND(DO1_LOOP);
 895         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 896         cmpw(ch1, ch2);
 897         br(EQ, MATCH);
 898         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 899         br(LT, DO1_LOOP);
 900     }
 901   }
 902   BIND(NOMATCH);
 903     mov(result, -1);
 904     b(DONE);
 905   BIND(MATCH);
 906     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
 907   BIND(DONE);
 908 }
 909 
 910 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
 911 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
 912 
 913 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
 914                                             Register ch, Register result,
 915                                             Register tmp1, Register tmp2, Register tmp3)
 916 {
 917   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 918   Register cnt1_neg = cnt1;
 919   Register ch1 = rscratch1;
 920   Register result_tmp = rscratch2;
 921 
 922   cbz(cnt1, NOMATCH);
 923 
 924   cmp(cnt1, (u1)4);
 925   br(LT, DO1_SHORT);
 926 
 927   orr(ch, ch, ch, LSL, 16);
 928   orr(ch, ch, ch, LSL, 32);
 929 
 930   sub(cnt1, cnt1, 4);
 931   mov(result_tmp, cnt1);
 932   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 933   sub(cnt1_neg, zr, cnt1, LSL, 1);
 934 
 935   mov(tmp3, 0x0001000100010001);
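  // Same SWAR zero-halfword detection as in string_indexof above: a zero
  // halfword in (ch ^ loaded word) indicates a matching character.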
 936 
 937   BIND(CH1_LOOP);
 938     ldr(ch1, Address(str1, cnt1_neg));
 939     eor(ch1, ch, ch1);
 940     sub(tmp1, ch1, tmp3);
 941     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
 942     bics(tmp1, tmp1, tmp2);
 943     br(NE, HAS_ZERO);
 944     adds(cnt1_neg, cnt1_neg, 8);
 945     br(LT, CH1_LOOP);
 946 
 947     cmp(cnt1_neg, (u1)8);
 948     mov(cnt1_neg, 0);
 949     br(LT, CH1_LOOP);
 950     b(NOMATCH);
 951 
 952   BIND(HAS_ZERO);
 953     rev(tmp1, tmp1);
 954     clz(tmp1, tmp1);
 955     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
 956     b(MATCH);
 957 
 958   BIND(DO1_SHORT);
 959     mov(result_tmp, cnt1);
 960     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
 961     sub(cnt1_neg, zr, cnt1, LSL, 1);
 962   BIND(DO1_LOOP);
 963     ldrh(ch1, Address(str1, cnt1_neg));
 964     cmpw(ch, ch1);
 965     br(EQ, MATCH);
 966     adds(cnt1_neg, cnt1_neg, 2);
 967     br(LT, DO1_LOOP);
 968   BIND(NOMATCH);
 969     mov(result, -1);
 970     b(DONE);
 971   BIND(MATCH);
 972     add(result, result_tmp, cnt1_neg, ASR, 1);
 973   BIND(DONE);
 974 }
 975 
 976 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
 977                                             Register ch, Register result,
 978                                             Register tmp1, Register tmp2, Register tmp3)
 979 {
 980   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
 981   Register cnt1_neg = cnt1;
 982   Register ch1 = rscratch1;
 983   Register result_tmp = rscratch2;
 984 
 985   cbz(cnt1, NOMATCH);
 986 
 987   cmp(cnt1, (u1)8);
 988   br(LT, DO1_SHORT);
 989 
 990   orr(ch, ch, ch, LSL, 8);
 991   orr(ch, ch, ch, LSL, 16);
 992   orr(ch, ch, ch, LSL, 32);
 993 
 994   sub(cnt1, cnt1, 8);
 995   mov(result_tmp, cnt1);
 996   lea(str1, Address(str1, cnt1));
 997   sub(cnt1_neg, zr, cnt1);
 998 
 999   mov(tmp3, 0x0101010101010101);
1000 
1001   BIND(CH1_LOOP);
1002     ldr(ch1, Address(str1, cnt1_neg));
1003     eor(ch1, ch, ch1);
1004     sub(tmp1, ch1, tmp3);
1005     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1006     bics(tmp1, tmp1, tmp2);
1007     br(NE, HAS_ZERO);
1008     adds(cnt1_neg, cnt1_neg, 8);
1009     br(LT, CH1_LOOP);
1010 
1011     cmp(cnt1_neg, (u1)8);
1012     mov(cnt1_neg, 0);
1013     br(LT, CH1_LOOP);
1014     b(NOMATCH);
1015 
1016   BIND(HAS_ZERO);
1017     rev(tmp1, tmp1);
1018     clz(tmp1, tmp1);
1019     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1020     b(MATCH);
1021 
1022   BIND(DO1_SHORT);
1023     mov(result_tmp, cnt1);
1024     lea(str1, Address(str1, cnt1));
1025     sub(cnt1_neg, zr, cnt1);
1026   BIND(DO1_LOOP);
1027     ldrb(ch1, Address(str1, cnt1_neg));
1028     cmp(ch, ch1);
1029     br(EQ, MATCH);
1030     adds(cnt1_neg, cnt1_neg, 1);
1031     br(LT, DO1_LOOP);
1032   BIND(NOMATCH);
1033     mov(result, -1);
1034     b(DONE);
1035   BIND(MATCH);
1036     add(result, result_tmp, cnt1_neg);
1037   BIND(DONE);
1038 }
1039 
1040 // Compare strings.
1041 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1042     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1043     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
1044   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1045       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1046       SHORT_LOOP_START, TAIL_CHECK;
1047 
1048   bool isLL = ae == StrIntrinsicNode::LL;
1049   bool isLU = ae == StrIntrinsicNode::LU;
1050   bool isUL = ae == StrIntrinsicNode::UL;
1051 
1052   // The stub threshold for LL strings is: 72 (64 + 8) chars
1053   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1054   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1055   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1056 
1057   bool str1_isL = isLL || isLU;
1058   bool str2_isL = isLL || isUL;
1059 
1060   int str1_chr_shift = str1_isL ? 0 : 1;
1061   int str2_chr_shift = str2_isL ? 0 : 1;
1062   int str1_chr_size = str1_isL ? 1 : 2;
1063   int str2_chr_size = str2_isL ? 1 : 2;
1064   int minCharsInWord = isLL ? wordSize : wordSize/2;
1065 
1066   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1067   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1068                                       (chr_insn)&MacroAssembler::ldrh;
1069   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1070                                       (chr_insn)&MacroAssembler::ldrh;
1071   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1072                             (uxt_insn)&MacroAssembler::uxthw;
1073 
1074   BLOCK_COMMENT("string_compare {");
1075 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, but the result is always in characters.
1078   if (!str1_isL) asrw(cnt1, cnt1, 1);
1079   if (!str2_isL) asrw(cnt2, cnt2, 1);
1080 
1081   // Compute the minimum of the string lengths and save the difference.
1082   subsw(result, cnt1, cnt2);
1083   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1084 
1085   // A very short string
1086   cmpw(cnt2, minCharsInWord);
1087   br(Assembler::LE, SHORT_STRING);
1088 
1089   // Compare longwords
1090   // load first parts of strings and finish initialization while loading
1091   {
1092     if (str1_isL == str2_isL) { // LL or UU
1093       ldr(tmp1, Address(str1));
1094       cmp(str1, str2);
1095       br(Assembler::EQ, DONE);
1096       ldr(tmp2, Address(str2));
1097       cmp(cnt2, stub_threshold);
1098       br(GE, STUB);
1099       subsw(cnt2, cnt2, minCharsInWord);
1100       br(EQ, TAIL_CHECK);
1101       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1102       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1103       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1104     } else if (isLU) {
1105       ldrs(vtmp, Address(str1));
1106       ldr(tmp2, Address(str2));
1107       cmp(cnt2, stub_threshold);
1108       br(GE, STUB);
1109       subw(cnt2, cnt2, 4);
1110       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1111       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1112       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1113       zip1(vtmp, T8B, vtmp, vtmpZ);
1114       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1115       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1116       add(cnt1, cnt1, 4);
1117       fmovd(tmp1, vtmp);
1118     } else { // UL case
1119       ldr(tmp1, Address(str1));
1120       ldrs(vtmp, Address(str2));
1121       cmp(cnt2, stub_threshold);
1122       br(GE, STUB);
1123       subw(cnt2, cnt2, 4);
1124       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1125       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1126       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1127       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1128       zip1(vtmp, T8B, vtmp, vtmpZ);
1129       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1130       add(cnt1, cnt1, 8);
1131       fmovd(tmp2, vtmp);
1132     }
1133     adds(cnt2, cnt2, isUL ? 4 : 8);
1134     br(GE, TAIL);
1135     eor(rscratch2, tmp1, tmp2);
1136     cbnz(rscratch2, DIFF);
1137     // main loop
1138     bind(NEXT_WORD);
1139     if (str1_isL == str2_isL) {
1140       ldr(tmp1, Address(str1, cnt2));
1141       ldr(tmp2, Address(str2, cnt2));
1142       adds(cnt2, cnt2, 8);
1143     } else if (isLU) {
1144       ldrs(vtmp, Address(str1, cnt1));
1145       ldr(tmp2, Address(str2, cnt2));
1146       add(cnt1, cnt1, 4);
1147       zip1(vtmp, T8B, vtmp, vtmpZ);
1148       fmovd(tmp1, vtmp);
1149       adds(cnt2, cnt2, 8);
1150     } else { // UL
1151       ldrs(vtmp, Address(str2, cnt2));
1152       ldr(tmp1, Address(str1, cnt1));
1153       zip1(vtmp, T8B, vtmp, vtmpZ);
1154       add(cnt1, cnt1, 8);
1155       fmovd(tmp2, vtmp);
1156       adds(cnt2, cnt2, 4);
1157     }
1158     br(GE, TAIL);
1159 
1160     eor(rscratch2, tmp1, tmp2);
1161     cbz(rscratch2, NEXT_WORD);
1162     b(DIFF);
1163     bind(TAIL);
1164     eor(rscratch2, tmp1, tmp2);
1165     cbnz(rscratch2, DIFF);
1166     // Last longword.  In the case where length == 4 we compare the
1167     // same longword twice, but that's still faster than another
1168     // conditional branch.
1169     if (str1_isL == str2_isL) {
1170       ldr(tmp1, Address(str1));
1171       ldr(tmp2, Address(str2));
1172     } else if (isLU) {
1173       ldrs(vtmp, Address(str1));
1174       ldr(tmp2, Address(str2));
1175       zip1(vtmp, T8B, vtmp, vtmpZ);
1176       fmovd(tmp1, vtmp);
1177     } else { // UL
1178       ldrs(vtmp, Address(str2));
1179       ldr(tmp1, Address(str1));
1180       zip1(vtmp, T8B, vtmp, vtmpZ);
1181       fmovd(tmp2, vtmp);
1182     }
1183     bind(TAIL_CHECK);
1184     eor(rscratch2, tmp1, tmp2);
1185     cbz(rscratch2, DONE);
1186 
1187     // Find the first different characters in the longwords and
1188     // compute their difference.
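    // rev + clz yield the bit offset of the lowest-addressed differing byte;
    // rounding it down to a character boundary (8 or 16 bits) and shifting both
    // words right by that amount brings the first differing characters into the
    // low bits, where they are zero-extended and subtracted.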
1189     bind(DIFF);
1190     rev(rscratch2, rscratch2);
1191     clz(rscratch2, rscratch2);
1192     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1193     lsrv(tmp1, tmp1, rscratch2);
1194     (this->*ext_chr)(tmp1, tmp1);
1195     lsrv(tmp2, tmp2, rscratch2);
1196     (this->*ext_chr)(tmp2, tmp2);
1197     subw(result, tmp1, tmp2);
1198     b(DONE);
1199   }
1200 
1201   bind(STUB);
1202     RuntimeAddress stub = NULL;
1203     switch(ae) {
1204       case StrIntrinsicNode::LL:
1205         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1206         break;
1207       case StrIntrinsicNode::UU:
1208         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1209         break;
1210       case StrIntrinsicNode::LU:
1211         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1212         break;
1213       case StrIntrinsicNode::UL:
1214         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1215         break;
1216       default:
1217         ShouldNotReachHere();
1218      }
1219     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
1220     trampoline_call(stub);
1221     b(DONE);
1222 
1223   bind(SHORT_STRING);
1224   // Is the minimum length zero?
1225   cbz(cnt2, DONE);
  // Arrange code to do most branches while loading, and to load the next
  // characters while comparing the previous ones.
1228   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1229   subs(cnt2, cnt2, 1);
1230   br(EQ, SHORT_LAST_INIT);
1231   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1232   b(SHORT_LOOP_START);
1233   bind(SHORT_LOOP);
1234   subs(cnt2, cnt2, 1);
1235   br(EQ, SHORT_LAST);
1236   bind(SHORT_LOOP_START);
1237   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1238   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1239   cmp(tmp1, cnt1);
1240   br(NE, SHORT_LOOP_TAIL);
1241   subs(cnt2, cnt2, 1);
1242   br(EQ, SHORT_LAST2);
1243   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1244   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1245   cmp(tmp2, rscratch1);
1246   br(EQ, SHORT_LOOP);
1247   sub(result, tmp2, rscratch1);
1248   b(DONE);
1249   bind(SHORT_LOOP_TAIL);
1250   sub(result, tmp1, cnt1);
1251   b(DONE);
1252   bind(SHORT_LAST2);
1253   cmp(tmp2, rscratch1);
1254   br(EQ, DONE);
1255   sub(result, tmp2, rscratch1);
1256 
1257   b(DONE);
1258   bind(SHORT_LAST_INIT);
1259   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1260   bind(SHORT_LAST);
1261   cmp(tmp1, cnt1);
1262   br(EQ, DONE);
1263   sub(result, tmp1, cnt1);
1264 
1265   bind(DONE);
1266 
1267   BLOCK_COMMENT("} string_compare");
1268 }
1269 
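// Vector compare helper. NEON has no direct "ne", "le" or "lt" compares, so
// "ne" is synthesized as "eq" followed by a bitwise not, "le"/"lt" reuse
// "ge"/"gt" with the operands swapped, and unsigned integer compares map to
// cmhs/cmhi in the same way.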
1270 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1271                                      FloatRegister src2, int cond, bool isQ) {
1272   SIMD_Arrangement size = esize2arrangement(type2aelembytes(bt), isQ);
1273   if (bt == T_FLOAT || bt == T_DOUBLE) {
1274     switch (cond) {
1275       case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
1276       case BoolTest::ne: {
1277         fcmeq(dst, size, src1, src2);
1278         notr(dst, T16B, dst);
1279         break;
1280       }
1281       case BoolTest::ge: fcmge(dst, size, src1, src2); break;
1282       case BoolTest::gt: fcmgt(dst, size, src1, src2); break;
1283       case BoolTest::le: fcmge(dst, size, src2, src1); break;
1284       case BoolTest::lt: fcmgt(dst, size, src2, src1); break;
1285       default:
1286         assert(false, "unsupported");
1287         ShouldNotReachHere();
1288     }
1289   } else {
1290     switch (cond) {
1291       case BoolTest::eq: cmeq(dst, size, src1, src2); break;
1292       case BoolTest::ne: {
1293         cmeq(dst, size, src1, src2);
1294         notr(dst, T16B, dst);
1295         break;
1296       }
1297       case BoolTest::ge: cmge(dst, size, src1, src2); break;
1298       case BoolTest::gt: cmgt(dst, size, src1, src2); break;
1299       case BoolTest::le: cmge(dst, size, src2, src1); break;
1300       case BoolTest::lt: cmgt(dst, size, src2, src1); break;
1301       case BoolTest::uge: cmhs(dst, size, src1, src2); break;
1302       case BoolTest::ugt: cmhi(dst, size, src1, src2); break;
1303       case BoolTest::ult: cmhi(dst, size, src2, src1); break;
1304       case BoolTest::ule: cmhs(dst, size, src2, src1); break;
1305       default:
1306         assert(false, "unsupported");
1307         ShouldNotReachHere();
1308     }
1309   }
1310 }
1311 
1312 void C2_MacroAssembler::load_nklass_compact(Register dst, Register obj, Register index, int scale, int disp) {
1313   C2LoadNKlassStub* stub = new (Compile::current()->comp_arena()) C2LoadNKlassStub(dst);
1314   Compile::current()->output()->add_stub(stub);
1315 
  // Note: Don't clobber obj anywhere in this method!
1317 
  // The incoming address points to obj-start + klass_offset_in_bytes. We need to extract
1319   // obj-start, so that we can load from the object's mark-word instead. Usually the address
1320   // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
1321   // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
1322   // then passes that register as obj and 0 in disp. The following code extracts the base
1323   // and offset to load the mark-word.
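  // Roughly, in pseudocode (a sketch; the stub handles the case where the
  // mark-word currently holds a monitor pointer rather than the klass bits):
  //   mark = *(uintptr_t*)(obj_start + mark_offset);
  //   if (mark & monitor_value) mark = <displaced mark, via C2LoadNKlassStub>;
  //   dst = mark >> markWord::klass_shift;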
1324   int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
1325   if (index == noreg) {
1326     ldr(dst, Address(obj, offset));
1327   } else {
1328     lea(dst, Address(obj, index, Address::lsl(scale)));
1329     ldr(dst, Address(dst, offset));
1330   }
1331   // NOTE: We can't use tbnz here, because the target is sometimes too far away
1332   // and cannot be encoded.
1333   tst(dst, markWord::monitor_value);
1334   br(Assembler::NE, stub->entry());
1335   bind(stub->continuation());
1336   lsr(dst, dst, markWord::klass_shift);
1337 }