1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 
  37 #ifdef PRODUCT
  38 #define BLOCK_COMMENT(str) /* nothing */
  39 #define STOP(error) stop(error)
  40 #else
  41 #define BLOCK_COMMENT(str) block_comment(str)
  42 #define STOP(error) block_comment(error); stop(error)
  43 #endif
  44 
  45 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  46 
  47 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  48 
  49 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
  50                                   Register tmp2Reg, Register tmp3Reg) {
  51   Register oop = objectReg;
  52   Register box = boxReg;
  53   Register disp_hdr = tmpReg;
  54   Register tmp = tmp2Reg;
  55   Label cont;
  56   Label object_has_monitor;
  57   Label count, no_count;
  58 
  59   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  60   assert_different_registers(oop, box, tmp, disp_hdr);
  61 
  62   // Load markWord from object into displaced_header.
  63   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
  64 
  65   if (DiagnoseSyncOnValueBasedClasses != 0) {
  66     load_klass(tmp, oop);
  67     ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
  68     tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
  69     br(Assembler::NE, cont);
  70   }
  71 
  72   // Check for existing monitor
  73   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
  74 
  75   if (LockingMode == LM_MONITOR) {
  76     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
  77     b(cont);
  78   } else {
  79     assert(LockingMode == LM_LEGACY, "must be");
  80     // Set tmp to be (markWord of object | UNLOCK_VALUE).
  81     orr(tmp, disp_hdr, markWord::unlocked_value);
  82 
  83     // Initialize the box. (Must happen before we update the object mark!)
  84     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
  85 
  86     // Compare object markWord with an unlocked value (tmp) and if
  87     // equal exchange the stack address of our box with object markWord.
  88     // On failure disp_hdr contains the possibly locked markWord.
  89     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
  90             /*release*/ true, /*weak*/ false, disp_hdr);
  91     br(Assembler::EQ, cont);
  92 
  93     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
  94 
  95     // If the compare-and-exchange succeeded, then we found an unlocked
  96     // object, have now locked it, and will continue at label cont.
  97 
  98     // Check if the owner is self by comparing the value in the
  99     // markWord of object (disp_hdr) with the stack pointer.
 100     mov(rscratch1, sp);
 101     sub(disp_hdr, disp_hdr, rscratch1);
 102     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
 103     // If the condition is true we are done (cont) and hence we can store 0 as the
 104     // displaced header in the box, which indicates that it is a recursive lock.
 105     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 106     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 107     b(cont);
 108   }
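       // The LM_LEGACY recursion check above is, roughly, the following hedged
       // C-style sketch ('page_size' stands for os::vm_page_size(); field names
       // are abbreviated):
       //
       //   uintptr_t delta = mark - sp;
       //   uintptr_t bits  = delta & (~(page_size - 1) | lock_mask_in_place);
       //   box->displaced_header = bits;   // 0 => recursive stack-lock by this thread
       //   // flags: EQ (bits == 0) => success, NE => take the slow path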
 109 
 110   // Handle existing monitor.
 111   bind(object_has_monitor);
 112 
 113   // The object's monitor m is unlocked iff m->owner == nullptr,
 114   // otherwise m->owner may contain a thread or a stack address.
 115   //
 116   // Try to CAS m->owner from null to current thread.
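       //
       // As a hedged C-style sketch of the code below (names are illustrative,
       // not the exact VM API):
       //
       //   if (CAS(&m->_owner, nullptr, current_thread)) {
       //     // locked; flags == EQ
       //   } else if (m->_owner == current_thread) {
       //     m->_recursions++;                // re-entrant; flags == EQ
       //   } else {
       //     // contended; flags == NE -> slow path
       //   }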
 117   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 118   cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
 119           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 120 
 121   // Store a non-null value into the box to avoid looking like a re-entrant
 122   // lock. The fast-path monitor unlock code checks for
 123   // markWord::monitor_value so use markWord::unused_mark which has the
 124   // relevant bit set, and also matches ObjectSynchronizer::enter.
 125   mov(tmp, (address)markWord::unused_mark().value());
 126   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 127 
 128   br(Assembler::EQ, cont); // CAS success means locking succeeded
 129 
 130   cmp(tmp3Reg, rthread);
 131   br(Assembler::NE, cont); // Check for recursive locking
 132 
 133   // Recursive lock case
 134   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 135   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 136 
 137   bind(cont);
 138   // flag == EQ indicates success
 139   // flag == NE indicates failure
 140   br(Assembler::NE, no_count);
 141 
 142   bind(count);
 143   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 144 
 145   bind(no_count);
 146 }
 147 
 148 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 149                                     Register tmp2Reg) {
 150   Register oop = objectReg;
 151   Register box = boxReg;
 152   Register disp_hdr = tmpReg;
 153   Register tmp = tmp2Reg;
 154   Label cont;
 155   Label object_has_monitor;
 156   Label count, no_count;
 157 
 158   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 159   assert_different_registers(oop, box, tmp, disp_hdr);
 160 
 161   if (LockingMode == LM_LEGACY) {
 162     // Find the lock address and load the displaced header from the stack.
 163     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 164 
 165     // If the displaced header is 0, we have a recursive unlock.
 166     cmp(disp_hdr, zr);
 167     br(Assembler::EQ, cont);
 168   }
 169 
 170   // Handle existing monitor.
 171   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 172   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 173 
 174   if (LockingMode == LM_MONITOR) {
 175     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 176     b(cont);
 177   } else {
 178     assert(LockingMode == LM_LEGACY, "must be");
 179     // Check if it is still a lightweight lock; this is true if we
 180     // see the stack address of the BasicLock in the markWord of the
 181     // object.
 182 
 183     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 184             /*release*/ true, /*weak*/ false, tmp);
 185     b(cont);
 186   }
 187 
 188   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 189 
 190   // Handle existing monitor.
 191   bind(object_has_monitor);
 192   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 193   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 194 
 195   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 196 
 197   Label notRecursive;
 198   cbz(disp_hdr, notRecursive);
 199 
 200   // Recursive lock
 201   sub(disp_hdr, disp_hdr, 1u);
 202   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 203   cmp(disp_hdr, disp_hdr); // Sets flags for result
 204   b(cont);
 205 
 206   bind(notRecursive);
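       // Hedged C-style sketch of the code below (field names abbreviated):
       //
       //   if (m->_EntryList != nullptr || m->_cxq != nullptr) {
       //     // possible waiters; flags == NE -> slow path
       //   } else {
       //     release_store(&m->_owner, nullptr);   // fast unlock; flags == EQ
       //   }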
 207   ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
 208   ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
 209   orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
 210   cmp(rscratch1, zr); // Sets flags for result
 211   cbnz(rscratch1, cont);
 212   // need a release store here
 213   lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
 214   stlr(zr, tmp); // set unowned
 215 
 216   bind(cont);
 217   // flag == EQ indicates success
 218   // flag == NE indicates failure
 219   br(Assembler::NE, no_count);
 220 
 221   bind(count);
 222   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 223 
 224   bind(no_count);
 225 }
 226 
 227 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 228                                               Register t2, Register t3) {
 229   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 230   assert_different_registers(obj, box, t1, t2, t3);
 231 
 232   // Handle inflated monitor.
 233   Label inflated;
 234   // Finish fast lock successfully. MUST branch to it with flag == EQ
 235   Label locked;
 236   // Finish fast lock unsuccessfully. MUST branch to it with flag == NE
 237   Label slow_path;
 238 
 239   // Clear box. TODO[OMWorld]: Is this necessary? May also defer this to not write twice.
 240   str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 241 
 242   if (DiagnoseSyncOnValueBasedClasses != 0) {
 243     load_klass(t1, obj);
 244     ldrw(t1, Address(t1, Klass::access_flags_offset()));
 245     tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
 246     br(Assembler::NE, slow_path);
 247   }
 248 
 249   const Register t1_mark = t1;
 250   const Register t3_t = t3;
 251 
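       // Hedged C-style sketch of the lightweight-locking fast path emitted
       // below (helper names are illustrative only):
       //
       //   if (lock_stack_is_full())      goto slow_path;
       //   if (lock_stack_top() == obj)   goto push;                 // recursive
       //   mark = obj->mark();
       //   if (mark.has_monitor())        goto inflated;
       //   expected = mark | unlocked_value;                         // lock bits 0b01
       //   if (!CAS(&obj->mark, expected, expected ^ unlocked_value)) goto slow_path;
       //  push:
       //   lock_stack_push(obj);                                     // flags == EQ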
 252   { // Lightweight locking
 253 
 254     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 255     Label push;
 256 
 257     const Register t2_top = t2;
 258 
 259     // Check if lock-stack is full.
 260     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 261     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 262     br(Assembler::GT, slow_path);
 263 
 264     // Check if recursive.
 265     subw(t3_t, t2_top, oopSize);
 266     ldr(t3_t, Address(rthread, t3_t));
 267     cmp(obj, t3_t);
 268     br(Assembler::EQ, push);
 269 
 270     // Relaxed normal load to check for monitor. Optimization for monitor case.
 271     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 272     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 273 
 274     // Not inflated
 275     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 276 
 277     // Try to lock. Transition lock-bits 0b01 => 0b00
 278     orr(t1_mark, t1_mark, markWord::unlocked_value);
 279     eor(t3_t, t1_mark, markWord::unlocked_value);
 280     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 281             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 282     br(Assembler::NE, slow_path);
 283 
 284     bind(push);
 285     // After successful lock, push object on lock-stack.
 286     str(obj, Address(rthread, t2_top));
 287     addw(t2_top, t2_top, oopSize);
 288     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 289     b(locked);
 290   }
 291 
 292   { // Handle inflated monitor.
 293     bind(inflated);
 294 
 295     if (!OMUseC2Cache) {
 296       // Set Flags == NE
 297       cmp(zr, obj);
 298       b(slow_path);
 299     } else {
 300 
 301       if (OMCacheHitRate) increment(Address(rthread, JavaThread::lock_lookup_offset()));
 302 
 303       Label monitor_found;
 304 
 305       // Load cache address
 306       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 307 
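           // The lookup below is, roughly, this hedged sketch (the cache is a small
           // thread-local array of (oop, monitor) pairs ending in a null sentinel):
           //
           //   for (entry = &cache[0]; ; entry++) {
           //     if (entry->oop == obj)      goto monitor_found;
           //     if (entry->oop == nullptr)  goto slow_path;   // cache miss
           //   }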
 308       const int num_unrolled = MIN2(OMC2UnrollCacheEntries, OMCacheSize);
 309       for (int i = 0; i < num_unrolled; i++) {
 310         ldr(t1, Address(t3_t));
 311         cmp(obj, t1);
 312         br(Assembler::EQ, monitor_found);
 313         if (i + 1 != num_unrolled) {
 314           increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 315         }
 316       }
 317 
 318       if (num_unrolled == 0 || (OMC2UnrollCacheLookupLoopTail && num_unrolled != OMCacheSize)) {
 319         if (num_unrolled != 0) {
 320           // Loop after unrolling, advance iterator.
 321           increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 322         }
 323 
 324         Label loop;
 325 
 326         // Search for obj in cache.
 327         bind(loop);
 328 
 329         // Check for match.
 330         ldr(t1, Address(t3_t));
 331         cmp(obj, t1);
 332         br(Assembler::EQ, monitor_found);
 333 
 334         // Search until null encountered, guaranteed _null_sentinel at end.
 335         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 336         cbnz(t1, loop);
 337         // Cache Miss, NE set from cmp above, cbnz does not set flags
 338         b(slow_path);
 339       } else {
 340         b(slow_path);
 341       }
 342 
 343       bind(monitor_found);
 344       ldr(t1, Address(t3_t, OMCache::oop_to_monitor_difference()));
 345       if (OMCacheHitRate) increment(Address(rthread, JavaThread::lock_hit_offset()));
 346 
 347       // ObjectMonitor* is in t1
 348       const Register t1_monitor = t1;
 349       const Register t2_owner_addr = t2;
 350       const Register t3_owner = t3;
 351 
 352       Label recursive;
 353       Label monitor_locked;
 354 
 355       // Compute owner address.
 356       lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 357 
 358       if (OMRecursiveFastPath) {
 359         ldr(t3_owner, Address(t2_owner_addr));
 360         cmp(t3_owner, rthread);
 361         br(Assembler::EQ, recursive);
 362       }
 363 
 364       // CAS owner (null => current thread).
 365       cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
 366               /*release*/ false, /*weak*/ false, t3_owner);
 367       br(Assembler::EQ, monitor_locked);
 368 
 369       if (OMRecursiveFastPath) {
 370         b(slow_path);
 371       } else {
 372         // Check if recursive.
 373         cmp(t3_owner, rthread);
 374         br(Assembler::NE, slow_path);
 375       }
 376 
 377       // Recursive.
 378       bind(recursive);
 379       increment(Address(t1_monitor, ObjectMonitor::recursions_offset()), 1);
 380 
 381       bind(monitor_locked);
 382       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 383     }
 384 
 385   }
 386 
 387   bind(locked);
 388   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 389 
 390 #ifdef ASSERT
 391   // Check that locked label is reached with Flags == EQ.
 392   Label flag_correct;
 393   br(Assembler::EQ, flag_correct);
 394   stop("Fast Lock Flag != EQ");
 395 #endif
 396 
 397   bind(slow_path);
 398 #ifdef ASSERT
 399   // Check that slow_path label is reached with Flags == NE.
 400   br(Assembler::NE, flag_correct);
 401   stop("Fast Lock Flag != NE");
 402   bind(flag_correct);
 403 #endif
 404   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 405 }
 406 
 407 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 408                                                 Register t2, Register t3) {
 409   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 410   assert_different_registers(obj, box, t1, t2, t3);
 411 
 412   // Handle inflated monitor.
 413   Label inflated, inflated_load_monitor;
 414   // Finish fast unlock successfully. MUST branch to it with flag == EQ
 415   Label unlocked;
 416   // Finish fast unlock unsuccessfully. MUST branch to it with flag == NE
 417   Label slow_path;
 418 
 419   const Register t1_mark = t1;
 420   const Register t2_top = t2;
 421   const Register t3_t = t3;
 422 
 423   { // Lightweight unlock
 424 
 425     Label push_and_slow_path;
 426 
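         // Hedged C-style sketch of the fast path below (helper names illustrative):
         //
         //   if (lock_stack_top() != obj)   goto inflated;              // must be a monitor
         //   lock_stack_pop();
         //   if (lock_stack_top() == obj)   goto unlocked;              // recursive
         //   mark = obj->mark();
         //   if (mark.has_monitor())                      { push obj back; goto slow_path; }
         //   if (!CAS(&obj->mark, mark, mark | unlocked)) { push obj back; goto slow_path; }
         //   goto unlocked;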
 427     // Check if obj is top of lock-stack.
 428     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 429     subw(t2_top, t2_top, oopSize);
 430     ldr(t3_t, Address(rthread, t2_top));
 431     cmp(obj, t3_t);
 432     // Top of lock stack was not obj. Must be monitor.
 433     br(Assembler::NE, inflated_load_monitor);
 434 
 435     // Pop lock-stack.
 436     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 437     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 438 
 439     // Check if recursive.
 440     subw(t3_t, t2_top, oopSize);
 441     ldr(t3_t, Address(rthread, t3_t));
 442     cmp(obj, t3_t);
 443     br(Assembler::EQ, unlocked);
 444 
 445     // Not recursive.
 446     // Load Mark.
 447     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 448 
 449     // Check header for monitor (0b10).
 450     // Because we got here by popping (meaning the obj was pushed when it was locked),
 451     // there will be no monitor in the box. So we need to push the obj back
 452     // so that the runtime can fix any potential anonymous owner.
 453     tbnz(t1_mark, exact_log2(markWord::monitor_value), push_and_slow_path);
 454 
 455     // Try to unlock. Transition lock bits 0b00 => 0b01
 456     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 457     orr(t3_t, t1_mark, markWord::unlocked_value);
 458     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 459             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 460     br(Assembler::EQ, unlocked);
 461 
 462     bind(push_and_slow_path);
 463     // Compare and exchange failed.
 464     // Restore lock-stack and handle the unlock in runtime.
 465     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 466     addw(t2_top, t2_top, oopSize);
 467     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 468     b(slow_path);
 469   }
 470 
 471 
 472   { // Handle inflated monitor.
 473     bind(inflated_load_monitor);
 474     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 475 #ifdef ASSERT
 476     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 477     stop("Fast Unlock not monitor");
 478 #endif
 479 
 480     bind(inflated);
 481 
 482 #ifdef ASSERT
 483     Label check_done;
 484     subw(t2_top, t2_top, oopSize);
 485     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 486     br(Assembler::LT, check_done);
 487     ldr(t3_t, Address(rthread, t2_top));
 488     cmp(obj, t3_t);
 489     br(Assembler::NE, inflated);
 490     stop("Fast Unlock lock on stack");
 491     bind(check_done);
 492 #endif
 493 
 494     if (!OMUseC2Cache) {
 495       b(slow_path);
 496     } else {
 497       const Register t1_monitor = t1;
 498 
 499       if (OMCacheHitRate) increment(Address(rthread, JavaThread::unlock_lookup_offset()));
 500       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 501       // TODO: Cleanup these constants (with an enum and asserts)
 502       cmp(t1_monitor, (uint8_t)2);
 503       // Not symmetrical: take the slow path if monitor == 0 or 1 (both are < 2, so LS and NE hold)
 504       br(Assembler::LO, slow_path);
 505       if (OMCacheHitRate) increment(Address(rthread, JavaThread::unlock_hit_offset()));
 506 
 507       const Register t2_recursions = t2;
 508       Label not_recursive;
 509 
 510       // Check if recursive.
 511       ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 512       cbz(t2_recursions, not_recursive);
 513 
 514       // Recursive unlock.
 515       sub(t2_recursions, t2_recursions, 1u);
 516       str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 517       // Set flag == EQ
 518       cmp(t2_recursions, t2_recursions);
 519       b(unlocked);
 520 
 521       bind(not_recursive);
 522 
 523       Label release;
 524       const Register t2_owner_addr = t2;
 525 
 526       // Compute owner address.
 527       lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 528 
 529       // Check if the entry lists are empty.
 530       ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
 531       ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
 532       orr(rscratch1, rscratch1, t3_t);
 533       cmp(rscratch1, zr);
 534       br(Assembler::EQ, release);
 535 
 536       // The owner may be anonymous and we removed the last obj entry in
 537       // the lock-stack. This loses the information about the owner.
 538       // Write the thread to the owner field so the runtime knows the owner.
 539       str(rthread, Address(t2_owner_addr));
 540       b(slow_path);
 541 
 542       bind(release);
 543       // Set owner to null.
 544       // Release to satisfy the JMM
 545       stlr(zr, t2_owner_addr);
 546     }
 547   }
 548 
 549   bind(unlocked);
 550   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 551 
 552 #ifdef ASSERT
 553   // Check that unlocked label is reached with Flags == EQ.
 554   Label flag_correct;
 555   br(Assembler::EQ, flag_correct);
 556   stop("Fast Unlock Flag != EQ");
 557 #endif
 558 
 559   bind(slow_path);
 560 #ifdef ASSERT
 561   // Check that slow_path label is reached with Flags == NE.
 562   br(Assembler::NE, flag_correct);
 563   stop("Fast Unlock Flag != NE");
 564   bind(flag_correct);
 565 #endif
 566   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 567 }
 568 
 569 // Search for str1 in str2 and return index or -1
 570 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 571 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 572                                        Register cnt2, Register cnt1,
 573                                        Register tmp1, Register tmp2,
 574                                        Register tmp3, Register tmp4,
 575                                        Register tmp5, Register tmp6,
 576                                        int icnt1, Register result, int ae) {
 577   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 578   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 579 
 580   Register ch1 = rscratch1;
 581   Register ch2 = rscratch2;
 582   Register cnt1tmp = tmp1;
 583   Register cnt2tmp = tmp2;
 584   Register cnt1_neg = cnt1;
 585   Register cnt2_neg = cnt2;
 586   Register result_tmp = tmp4;
 587 
 588   bool isL = ae == StrIntrinsicNode::LL;
 589 
 590   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 591   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 592   int str1_chr_shift = str1_isL ? 0:1;
 593   int str2_chr_shift = str2_isL ? 0:1;
 594   int str1_chr_size = str1_isL ? 1:2;
 595   int str2_chr_size = str2_isL ? 1:2;
 596   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 597                                       (chr_insn)&MacroAssembler::ldrh;
 598   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 599                                       (chr_insn)&MacroAssembler::ldrh;
 600   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 601   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 602 
 603   // Note, inline_string_indexOf() generates checks:
 604   // if (substr.count > string.count) return -1;
 605   // if (substr.count == 0) return 0;
 606 
 607   // We have two strings, a source string in str2, cnt2 and a pattern string
 608   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 609 
 610   // For larger pattern and source we use a simplified Boyer Moore algorithm.
 611   // With a small pattern and source we use linear scan.
 612 
 613   if (icnt1 == -1) {
 614     sub(result_tmp, cnt2, cnt1);
 615     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 616     br(LT, LINEARSEARCH);
 617     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 618     subs(zr, cnt1, 256);
 619     lsr(tmp1, cnt2, 2);
 620     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 621     br(GE, LINEARSTUB);
 622   }
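       // In effect (a hedged summary of the dispatch above):
       //   cnt1 < 8                         -> LINEARSEARCH
       //   cnt1 >= 256 || cnt1 >= cnt2 / 4  -> LINEARSTUB
       //   otherwise                        -> Boyer-Moore(-Horspool) code below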
 623 
 624 // The Boyer-Moore algorithm is based on the description here:
 625 //
 626 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
 627 //
 628 // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
 629 // and the 'Good Suffix' rule.
 630 //
 631 // These rules are essentially heuristics for how far we can shift the
 632 // pattern along the search string.
 633 //
 634 // The implementation here uses the 'Bad Character' rule only because of the
 635 // complexity of initialisation for the 'Good Suffix' rule.
 636 //
 637 // This is also known as the Boyer-Moore-Horspool algorithm:
 638 //
 639 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
 640 //
 641 // This particular implementation has a few Java-specific optimizations.
 642 //
 643 // #define ASIZE 256
 644 //
 645 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 646 //       int i, j;
 647 //       unsigned c;
 648 //       unsigned char bc[ASIZE];
 649 //
 650 //       /* Preprocessing */
 651 //       for (i = 0; i < ASIZE; ++i)
 652 //          bc[i] = m;
 653 //       for (i = 0; i < m - 1; ) {
 654 //          c = x[i];
 655 //          ++i;
 656 //          // c < 256 for Latin1 string, so, no need for branch
 657 //          #ifdef PATTERN_STRING_IS_LATIN1
 658 //          bc[c] = m - i;
 659 //          #else
 660 //          if (c < ASIZE) bc[c] = m - i;
 661 //          #endif
 662 //       }
 663 //
 664 //       /* Searching */
 665 //       j = 0;
 666 //       while (j <= n - m) {
 667 //          c = y[i+j];
 668 //          if (x[m-1] == c)
 669 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
 670 //          if (i < 0) return j;
 671 //          // c < 256 for Latin1 string, so, no need for branch
 672 //          #ifdef SOURCE_STRING_IS_LATIN1
 673 //          // LL case: (c< 256) always true. Remove branch
 674 //          j += bc[y[j+m-1]];
 675 //          #endif
 676 //          #ifndef PATTERN_STRING_IS_UTF
 677 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 678 //          if (c < ASIZE)
 679 //            j += bc[y[j+m-1]];
 680 //          else
 681 //            j += 1
 682 //          #endif
 683 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 684 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 685 //          if (c < ASIZE)
 686 //            j += bc[y[j+m-1]];
 687 //          else
 688 //            j += m
 689 //          #endif
 690 //       }
 691 //    }
 692 
 693   if (icnt1 == -1) {
 694     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 695         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 696     Register cnt1end = tmp2;
 697     Register str2end = cnt2;
 698     Register skipch = tmp2;
 699 
 700     // str1 length is >= 8, so we can read at least 1 register for cases when
 701     // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half a register for
 702     // the UL case. We'll re-read the last character in the inner pre-loop code to have
 703     // a single outer pre-loop load.
 704     const int firstStep = isL ? 7 : 3;
 705 
 706     const int ASIZE = 256;
 707     const int STORED_BYTES = 32; // amount of bytes stored per instruction
 708     sub(sp, sp, ASIZE);
 709     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 710     mov(ch1, sp);
 711     BIND(BM_INIT_LOOP);
 712       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 713       subs(tmp5, tmp5, 1);
 714       br(GT, BM_INIT_LOOP);
 715 
 716       sub(cnt1tmp, cnt1, 1);
 717       mov(tmp5, str2);
 718       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 719       sub(ch2, cnt1, 1);
 720       mov(tmp3, str1);
 721     BIND(BCLOOP);
 722       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 723       if (!str1_isL) {
 724         subs(zr, ch1, ASIZE);
 725         br(HS, BCSKIP);
 726       }
 727       strb(ch2, Address(sp, ch1));
 728     BIND(BCSKIP);
 729       subs(ch2, ch2, 1);
 730       br(GT, BCLOOP);
 731 
 732       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 733       if (str1_isL == str2_isL) {
 734         // load last 8 bytes (8LL/4UU symbols)
 735         ldr(tmp6, Address(tmp6, -wordSize));
 736       } else {
 737         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
 738         // convert Latin1 to UTF. We'll have to wait until the load completes, but
 739         // it's still faster than per-character loads + checks
 740         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 741         ubfx(ch1, tmp6, 8, 8); // str1[N-3]
 742         ubfx(ch2, tmp6, 16, 8); // str1[N-2]
 743         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 744         orr(ch2, ch1, ch2, LSL, 16);
 745         orr(tmp6, tmp6, tmp3, LSL, 48);
 746         orr(tmp6, tmp6, ch2, LSL, 16);
 747       }
 748     BIND(BMLOOPSTR2);
 749       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 750       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 751       if (str1_isL == str2_isL) {
 752         // re-init tmp3. It's for free because it's executed in parallel with
 753         // the load above. The alternative is to initialize it before the loop, but that
 754         // affects performance on in-order systems with 2 or more ld/st pipelines
 755         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 756       }
 757       if (!isL) { // UU/UL case
 758         lsl(ch2, cnt1tmp, 1); // offset in bytes
 759       }
 760       cmp(tmp3, skipch);
 761       br(NE, BMSKIP);
 762       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 763       mov(ch1, tmp6);
 764       if (isL) {
 765         b(BMLOOPSTR1_AFTER_LOAD);
 766       } else {
 767         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 768         b(BMLOOPSTR1_CMP);
 769       }
 770     BIND(BMLOOPSTR1);
 771       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 772       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 773     BIND(BMLOOPSTR1_AFTER_LOAD);
 774       subs(cnt1tmp, cnt1tmp, 1);
 775       br(LT, BMLOOPSTR1_LASTCMP);
 776     BIND(BMLOOPSTR1_CMP);
 777       cmp(ch1, ch2);
 778       br(EQ, BMLOOPSTR1);
 779     BIND(BMSKIP);
 780       if (!isL) {
 781         // if we've met a UTF symbol while searching with a Latin1 pattern, then we can
 782         // skip cnt1 symbols
 783         if (str1_isL != str2_isL) {
 784           mov(result_tmp, cnt1);
 785         } else {
 786           mov(result_tmp, 1);
 787         }
 788         subs(zr, skipch, ASIZE);
 789         br(HS, BMADV);
 790       }
 791       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 792     BIND(BMADV);
 793       sub(cnt1tmp, cnt1, 1);
 794       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 795       cmp(str2, str2end);
 796       br(LE, BMLOOPSTR2);
 797       add(sp, sp, ASIZE);
 798       b(NOMATCH);
 799     BIND(BMLOOPSTR1_LASTCMP);
 800       cmp(ch1, ch2);
 801       br(NE, BMSKIP);
 802     BIND(BMMATCH);
 803       sub(result, str2, tmp5);
 804       if (!str2_isL) lsr(result, result, 1);
 805       add(sp, sp, ASIZE);
 806       b(DONE);
 807 
 808     BIND(LINEARSTUB);
 809     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
 810     br(LT, LINEAR_MEDIUM);
 811     mov(result, zr);
 812     RuntimeAddress stub = nullptr;
 813     if (isL) {
 814       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 815       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 816     } else if (str1_isL) {
 817       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
 818       assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 819     } else {
 820       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 821       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 822     }
 823     address call = trampoline_call(stub);
 824     if (call == nullptr) {
 825       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 826       ciEnv::current()->record_failure("CodeCache is full");
 827       return;
 828     }
 829     b(DONE);
 830   }
 831 
 832   BIND(LINEARSEARCH);
 833   {
 834     Label DO1, DO2, DO3;
 835 
 836     Register str2tmp = tmp2;
 837     Register first = tmp3;
 838 
 839     if (icnt1 == -1)
 840     {
 841         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 842 
 843         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 844         br(LT, DOSHORT);
 845       BIND(LINEAR_MEDIUM);
 846         (this->*str1_load_1chr)(first, Address(str1));
 847         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 848         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 849         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 850         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 851 
 852       BIND(FIRST_LOOP);
 853         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 854         cmp(first, ch2);
 855         br(EQ, STR1_LOOP);
 856       BIND(STR2_NEXT);
 857         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 858         br(LE, FIRST_LOOP);
 859         b(NOMATCH);
 860 
 861       BIND(STR1_LOOP);
 862         adds(cnt1tmp, cnt1_neg, str1_chr_size);
 863         add(cnt2tmp, cnt2_neg, str2_chr_size);
 864         br(GE, MATCH);
 865 
 866       BIND(STR1_NEXT);
 867         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
 868         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 869         cmp(ch1, ch2);
 870         br(NE, STR2_NEXT);
 871         adds(cnt1tmp, cnt1tmp, str1_chr_size);
 872         add(cnt2tmp, cnt2tmp, str2_chr_size);
 873         br(LT, STR1_NEXT);
 874         b(MATCH);
 875 
 876       BIND(DOSHORT);
 877       if (str1_isL == str2_isL) {
 878         cmp(cnt1, (u1)2);
 879         br(LT, DO1);
 880         br(GT, DO3);
 881       }
 882     }
 883 
 884     if (icnt1 == 4) {
 885       Label CH1_LOOP;
 886 
 887         (this->*load_4chr)(ch1, str1);
 888         sub(result_tmp, cnt2, 4);
 889         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 890         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 891 
 892       BIND(CH1_LOOP);
 893         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
 894         cmp(ch1, ch2);
 895         br(EQ, MATCH);
 896         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 897         br(LE, CH1_LOOP);
 898         b(NOMATCH);
 899       }
 900 
 901     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
 902       Label CH1_LOOP;
 903 
 904       BIND(DO2);
 905         (this->*load_2chr)(ch1, str1);
 906         if (icnt1 == 2) {
 907           sub(result_tmp, cnt2, 2);
 908         }
 909         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 910         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 911       BIND(CH1_LOOP);
 912         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 913         cmp(ch1, ch2);
 914         br(EQ, MATCH);
 915         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 916         br(LE, CH1_LOOP);
 917         b(NOMATCH);
 918     }
 919 
 920     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
 921       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
 922 
 923       BIND(DO3);
 924         (this->*load_2chr)(first, str1);
 925         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
 926         if (icnt1 == 3) {
 927           sub(result_tmp, cnt2, 3);
 928         }
 929         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 930         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 931       BIND(FIRST_LOOP);
 932         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
 933         cmpw(first, ch2);
 934         br(EQ, STR1_LOOP);
 935       BIND(STR2_NEXT);
 936         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 937         br(LE, FIRST_LOOP);
 938         b(NOMATCH);
 939 
 940       BIND(STR1_LOOP);
 941         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
 942         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
 943         cmp(ch1, ch2);
 944         br(NE, STR2_NEXT);
 945         b(MATCH);
 946     }
 947 
 948     if (icnt1 == -1 || icnt1 == 1) {
 949       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
 950 
 951       BIND(DO1);
 952         (this->*str1_load_1chr)(ch1, str1);
 953         cmp(cnt2, (u1)8);
 954         br(LT, DO1_SHORT);
 955 
 956         sub(result_tmp, cnt2, 8/str2_chr_size);
 957         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 958         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 959         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 960 
 961         if (str2_isL) {
 962           orr(ch1, ch1, ch1, LSL, 8);
 963         }
 964         orr(ch1, ch1, ch1, LSL, 16);
 965         orr(ch1, ch1, ch1, LSL, 32);
 966       BIND(CH1_LOOP);
 967         ldr(ch2, Address(str2, cnt2_neg));
 968         eor(ch2, ch1, ch2);
 969         sub(tmp1, ch2, tmp3);
 970         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 971         bics(tmp1, tmp1, tmp2);
 972         br(NE, HAS_ZERO);
 973         adds(cnt2_neg, cnt2_neg, 8);
 974         br(LT, CH1_LOOP);
 975 
 976         cmp(cnt2_neg, (u1)8);
 977         mov(cnt2_neg, 0);
 978         br(LT, CH1_LOOP);
 979         b(NOMATCH);
 980 
 981       BIND(HAS_ZERO);
 982         rev(tmp1, tmp1);
 983         clz(tmp1, tmp1);
 984         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
 985         b(MATCH);
 986 
 987       BIND(DO1_SHORT);
 988         mov(result_tmp, cnt2);
 989         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
 990         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
 991       BIND(DO1_LOOP);
 992         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 993         cmpw(ch1, ch2);
 994         br(EQ, MATCH);
 995         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 996         br(LT, DO1_LOOP);
 997     }
 998   }
 999   BIND(NOMATCH);
1000     mov(result, -1);
1001     b(DONE);
1002   BIND(MATCH);
1003     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
1004   BIND(DONE);
1005 }
1006 
1007 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
1008 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
1009 
1010 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
1011                                             Register ch, Register result,
1012                                             Register tmp1, Register tmp2, Register tmp3)
1013 {
1014   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1015   Register cnt1_neg = cnt1;
1016   Register ch1 = rscratch1;
1017   Register result_tmp = rscratch2;
1018 
1019   cbz(cnt1, NOMATCH);
1020 
1021   cmp(cnt1, (u1)4);
1022   br(LT, DO1_SHORT);
1023 
1024   orr(ch, ch, ch, LSL, 16);
1025   orr(ch, ch, ch, LSL, 32);
1026 
1027   sub(cnt1, cnt1, 4);
1028   mov(result_tmp, cnt1);
1029   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1030   sub(cnt1_neg, zr, cnt1, LSL, 1);
1031 
1032   mov(tmp3, 0x0001000100010001);
1033 
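       // The loop below uses a SWAR trick to test 4 UTF-16 chars per 64-bit load.
       // Hedged C-style sketch (ch was replicated into every 16-bit lane above):
       //
       //   x    = load64(str1 + i) ^ ch_repeated;                  // zero lane == match
       //   zero = (x - 0x0001000100010001) & ~(x | 0x7fff7fff7fff7fff);
       //   if (zero != 0) { /* first matching lane located via rev + clz */ }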
1034   BIND(CH1_LOOP);
1035     ldr(ch1, Address(str1, cnt1_neg));
1036     eor(ch1, ch, ch1);
1037     sub(tmp1, ch1, tmp3);
1038     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
1039     bics(tmp1, tmp1, tmp2);
1040     br(NE, HAS_ZERO);
1041     adds(cnt1_neg, cnt1_neg, 8);
1042     br(LT, CH1_LOOP);
1043 
1044     cmp(cnt1_neg, (u1)8);
1045     mov(cnt1_neg, 0);
1046     br(LT, CH1_LOOP);
1047     b(NOMATCH);
1048 
1049   BIND(HAS_ZERO);
1050     rev(tmp1, tmp1);
1051     clz(tmp1, tmp1);
1052     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1053     b(MATCH);
1054 
1055   BIND(DO1_SHORT);
1056     mov(result_tmp, cnt1);
1057     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1058     sub(cnt1_neg, zr, cnt1, LSL, 1);
1059   BIND(DO1_LOOP);
1060     ldrh(ch1, Address(str1, cnt1_neg));
1061     cmpw(ch, ch1);
1062     br(EQ, MATCH);
1063     adds(cnt1_neg, cnt1_neg, 2);
1064     br(LT, DO1_LOOP);
1065   BIND(NOMATCH);
1066     mov(result, -1);
1067     b(DONE);
1068   BIND(MATCH);
1069     add(result, result_tmp, cnt1_neg, ASR, 1);
1070   BIND(DONE);
1071 }
1072 
1073 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1074                                                 Register ch, Register result,
1075                                                 FloatRegister ztmp1,
1076                                                 FloatRegister ztmp2,
1077                                                 PRegister tmp_pg,
1078                                                 PRegister tmp_pdn, bool isL)
1079 {
1080   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1081   assert(tmp_pg->is_governing(),
1082          "this register has to be a governing predicate register");
1083 
1084   Label LOOP, MATCH, DONE, NOMATCH;
1085   Register vec_len = rscratch1;
1086   Register idx = rscratch2;
1087 
1088   SIMD_RegVariant T = isL ? B : H;
1089 
1090   cbz(cnt1, NOMATCH);
1091 
1092   // Assign the particular char throughout the vector.
1093   sve_dup(ztmp2, T, ch);
1094   if (isL) {
1095     sve_cntb(vec_len);
1096   } else {
1097     sve_cnth(vec_len);
1098   }
1099   mov(idx, 0);
1100 
1101   // Generate a predicate to control the reading of input string.
1102   sve_whilelt(tmp_pg, T, idx, cnt1);
1103 
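       // Hedged sketch of the predicated loop below (vl = elements per vector):
       //
       //   for (idx = 0; whilelt(idx, cnt1) has active lanes; idx += vl) {
       //     data = predicated_load(str1 + idx);   // inactive lanes are not read
       //     if (any active lane of data == ch)  goto MATCH;
       //   }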
1104   BIND(LOOP);
1105     // Read a vector of 8- or 16-bit data depending on the string type. Note
1106     // that inactive elements indicated by the predicate register won't cause
1107     // a data read from memory to the destination vector.
1108     if (isL) {
1109       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1110     } else {
1111       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1112     }
1113     add(idx, idx, vec_len);
1114 
1115     // Perform the comparison. An element of the destination predicate is set
1116     // to active if the particular char is matched.
1117     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1118 
1119     // Branch if the particular char is found.
1120     br(NE, MATCH);
1121 
1122     sve_whilelt(tmp_pg, T, idx, cnt1);
1123 
1124     // Loop back if the particular char not found.
1125     br(MI, LOOP);
1126 
1127   BIND(NOMATCH);
1128     mov(result, -1);
1129     b(DONE);
1130 
1131   BIND(MATCH);
1132     // Undo the index increment.
1133     sub(idx, idx, vec_len);
1134 
1135     // Crop the vector to find its location.
1136     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1137     add(result, idx, -1);
1138     sve_incp(result, T, tmp_pdn);
1139   BIND(DONE);
1140 }
1141 
1142 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1143                                             Register ch, Register result,
1144                                             Register tmp1, Register tmp2, Register tmp3)
1145 {
1146   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1147   Register cnt1_neg = cnt1;
1148   Register ch1 = rscratch1;
1149   Register result_tmp = rscratch2;
1150 
1151   cbz(cnt1, NOMATCH);
1152 
1153   cmp(cnt1, (u1)8);
1154   br(LT, DO1_SHORT);
1155 
1156   orr(ch, ch, ch, LSL, 8);
1157   orr(ch, ch, ch, LSL, 16);
1158   orr(ch, ch, ch, LSL, 32);
1159 
1160   sub(cnt1, cnt1, 8);
1161   mov(result_tmp, cnt1);
1162   lea(str1, Address(str1, cnt1));
1163   sub(cnt1_neg, zr, cnt1);
1164 
1165   mov(tmp3, 0x0101010101010101);
1166 
1167   BIND(CH1_LOOP);
1168     ldr(ch1, Address(str1, cnt1_neg));
1169     eor(ch1, ch, ch1);
1170     sub(tmp1, ch1, tmp3);
1171     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1172     bics(tmp1, tmp1, tmp2);
1173     br(NE, HAS_ZERO);
1174     adds(cnt1_neg, cnt1_neg, 8);
1175     br(LT, CH1_LOOP);
1176 
1177     cmp(cnt1_neg, (u1)8);
1178     mov(cnt1_neg, 0);
1179     br(LT, CH1_LOOP);
1180     b(NOMATCH);
1181 
1182   BIND(HAS_ZERO);
1183     rev(tmp1, tmp1);
1184     clz(tmp1, tmp1);
1185     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1186     b(MATCH);
1187 
1188   BIND(DO1_SHORT);
1189     mov(result_tmp, cnt1);
1190     lea(str1, Address(str1, cnt1));
1191     sub(cnt1_neg, zr, cnt1);
1192   BIND(DO1_LOOP);
1193     ldrb(ch1, Address(str1, cnt1_neg));
1194     cmp(ch, ch1);
1195     br(EQ, MATCH);
1196     adds(cnt1_neg, cnt1_neg, 1);
1197     br(LT, DO1_LOOP);
1198   BIND(NOMATCH);
1199     mov(result, -1);
1200     b(DONE);
1201   BIND(MATCH);
1202     add(result, result_tmp, cnt1_neg);
1203   BIND(DONE);
1204 }
1205 
1206 // Compare strings.
1207 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1208     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1209     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1210     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1211   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1212       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1213       SHORT_LOOP_START, TAIL_CHECK;
1214 
1215   bool isLL = ae == StrIntrinsicNode::LL;
1216   bool isLU = ae == StrIntrinsicNode::LU;
1217   bool isUL = ae == StrIntrinsicNode::UL;
1218 
1219   // The stub threshold for LL strings is: 72 (64 + 8) chars
1220   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1221   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1222   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1223 
1224   bool str1_isL = isLL || isLU;
1225   bool str2_isL = isLL || isUL;
1226 
1227   int str1_chr_shift = str1_isL ? 0 : 1;
1228   int str2_chr_shift = str2_isL ? 0 : 1;
1229   int str1_chr_size = str1_isL ? 1 : 2;
1230   int str2_chr_size = str2_isL ? 1 : 2;
1231   int minCharsInWord = isLL ? wordSize : wordSize/2;
1232 
1233   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1234   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1235                                       (chr_insn)&MacroAssembler::ldrh;
1236   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1237                                       (chr_insn)&MacroAssembler::ldrh;
1238   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1239                             (uxt_insn)&MacroAssembler::uxthw;
1240 
1241   BLOCK_COMMENT("string_compare {");
1242 
1243   // Bizarrely, the counts are passed in bytes, regardless of whether they
1244   // are L or U strings; however, the result is always in characters.
1245   if (!str1_isL) asrw(cnt1, cnt1, 1);
1246   if (!str2_isL) asrw(cnt2, cnt2, 1);
1247 
1248   // Compute the minimum of the string lengths and save the difference.
1249   subsw(result, cnt1, cnt2);
1250   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1251 
1252   // A very short string
1253   cmpw(cnt2, minCharsInWord);
1254   br(Assembler::LE, SHORT_STRING);
1255 
1256   // Compare longwords
1257   // load first parts of strings and finish initialization while loading
1258   {
1259     if (str1_isL == str2_isL) { // LL or UU
1260       ldr(tmp1, Address(str1));
1261       cmp(str1, str2);
1262       br(Assembler::EQ, DONE);
1263       ldr(tmp2, Address(str2));
1264       cmp(cnt2, stub_threshold);
1265       br(GE, STUB);
1266       subsw(cnt2, cnt2, minCharsInWord);
1267       br(EQ, TAIL_CHECK);
1268       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1269       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1270       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1271     } else if (isLU) {
1272       ldrs(vtmp, Address(str1));
1273       ldr(tmp2, Address(str2));
1274       cmp(cnt2, stub_threshold);
1275       br(GE, STUB);
1276       subw(cnt2, cnt2, 4);
1277       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1278       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1279       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1280       zip1(vtmp, T8B, vtmp, vtmpZ);
1281       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1282       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1283       add(cnt1, cnt1, 4);
1284       fmovd(tmp1, vtmp);
1285     } else { // UL case
1286       ldr(tmp1, Address(str1));
1287       ldrs(vtmp, Address(str2));
1288       cmp(cnt2, stub_threshold);
1289       br(GE, STUB);
1290       subw(cnt2, cnt2, 4);
1291       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1292       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1293       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1294       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1295       zip1(vtmp, T8B, vtmp, vtmpZ);
1296       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1297       add(cnt1, cnt1, 8);
1298       fmovd(tmp2, vtmp);
1299     }
1300     adds(cnt2, cnt2, isUL ? 4 : 8);
1301     br(GE, TAIL);
1302     eor(rscratch2, tmp1, tmp2);
1303     cbnz(rscratch2, DIFF);
1304     // main loop
1305     bind(NEXT_WORD);
1306     if (str1_isL == str2_isL) {
1307       ldr(tmp1, Address(str1, cnt2));
1308       ldr(tmp2, Address(str2, cnt2));
1309       adds(cnt2, cnt2, 8);
1310     } else if (isLU) {
1311       ldrs(vtmp, Address(str1, cnt1));
1312       ldr(tmp2, Address(str2, cnt2));
1313       add(cnt1, cnt1, 4);
1314       zip1(vtmp, T8B, vtmp, vtmpZ);
1315       fmovd(tmp1, vtmp);
1316       adds(cnt2, cnt2, 8);
1317     } else { // UL
1318       ldrs(vtmp, Address(str2, cnt2));
1319       ldr(tmp1, Address(str1, cnt1));
1320       zip1(vtmp, T8B, vtmp, vtmpZ);
1321       add(cnt1, cnt1, 8);
1322       fmovd(tmp2, vtmp);
1323       adds(cnt2, cnt2, 4);
1324     }
1325     br(GE, TAIL);
1326 
1327     eor(rscratch2, tmp1, tmp2);
1328     cbz(rscratch2, NEXT_WORD);
1329     b(DIFF);
1330     bind(TAIL);
1331     eor(rscratch2, tmp1, tmp2);
1332     cbnz(rscratch2, DIFF);
1333     // Last longword.  In the case where length == 4 we compare the
1334     // same longword twice, but that's still faster than another
1335     // conditional branch.
1336     if (str1_isL == str2_isL) {
1337       ldr(tmp1, Address(str1));
1338       ldr(tmp2, Address(str2));
1339     } else if (isLU) {
1340       ldrs(vtmp, Address(str1));
1341       ldr(tmp2, Address(str2));
1342       zip1(vtmp, T8B, vtmp, vtmpZ);
1343       fmovd(tmp1, vtmp);
1344     } else { // UL
1345       ldrs(vtmp, Address(str2));
1346       ldr(tmp1, Address(str1));
1347       zip1(vtmp, T8B, vtmp, vtmpZ);
1348       fmovd(tmp2, vtmp);
1349     }
1350     bind(TAIL_CHECK);
1351     eor(rscratch2, tmp1, tmp2);
1352     cbz(rscratch2, DONE);
1353 
1354     // Find the first different characters in the longwords and
1355     // compute their difference.
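         // Hedged sketch: rscratch2 = tmp1 ^ tmp2 is non-zero here, so
         //   shift  = clz(byte_reverse(rscratch2)) rounded down to a char boundary;
         //   result = (int)char_at(tmp1 >> shift) - (int)char_at(tmp2 >> shift);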
1356     bind(DIFF);
1357     rev(rscratch2, rscratch2);
1358     clz(rscratch2, rscratch2);
1359     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1360     lsrv(tmp1, tmp1, rscratch2);
1361     (this->*ext_chr)(tmp1, tmp1);
1362     lsrv(tmp2, tmp2, rscratch2);
1363     (this->*ext_chr)(tmp2, tmp2);
1364     subw(result, tmp1, tmp2);
1365     b(DONE);
1366   }
1367 
1368   bind(STUB);
1369     RuntimeAddress stub = nullptr;
1370     switch(ae) {
1371       case StrIntrinsicNode::LL:
1372         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1373         break;
1374       case StrIntrinsicNode::UU:
1375         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1376         break;
1377       case StrIntrinsicNode::LU:
1378         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1379         break;
1380       case StrIntrinsicNode::UL:
1381         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1382         break;
1383       default:
1384         ShouldNotReachHere();
1385      }
1386     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1387     address call = trampoline_call(stub);
1388     if (call == nullptr) {
1389       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1390       ciEnv::current()->record_failure("CodeCache is full");
1391       return;
1392     }
1393     b(DONE);
1394 
1395   bind(SHORT_STRING);
1396   // Is the minimum length zero?
1397   cbz(cnt2, DONE);
1398   // Arrange code to do most branches while loading, and to load the next characters
1399   // while comparing the previous ones.
1400   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1401   subs(cnt2, cnt2, 1);
1402   br(EQ, SHORT_LAST_INIT);
1403   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1404   b(SHORT_LOOP_START);
1405   bind(SHORT_LOOP);
1406   subs(cnt2, cnt2, 1);
1407   br(EQ, SHORT_LAST);
1408   bind(SHORT_LOOP_START);
1409   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1410   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1411   cmp(tmp1, cnt1);
1412   br(NE, SHORT_LOOP_TAIL);
1413   subs(cnt2, cnt2, 1);
1414   br(EQ, SHORT_LAST2);
1415   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1416   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1417   cmp(tmp2, rscratch1);
1418   br(EQ, SHORT_LOOP);
1419   sub(result, tmp2, rscratch1);
1420   b(DONE);
1421   bind(SHORT_LOOP_TAIL);
1422   sub(result, tmp1, cnt1);
1423   b(DONE);
1424   bind(SHORT_LAST2);
1425   cmp(tmp2, rscratch1);
1426   br(EQ, DONE);
1427   sub(result, tmp2, rscratch1);
1428 
1429   b(DONE);
1430   bind(SHORT_LAST_INIT);
1431   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1432   bind(SHORT_LAST);
1433   cmp(tmp1, cnt1);
1434   br(EQ, DONE);
1435   sub(result, tmp1, cnt1);
1436 
1437   bind(DONE);
1438 
1439   BLOCK_COMMENT("} string_compare");
1440 }
1441 
1442 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1443                                      FloatRegister src2, Condition cond, bool isQ) {
1444   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1445   FloatRegister zn = src1, zm = src2;
1446   bool needs_negation = false;
1447   switch (cond) {
1448     case LT: cond = GT; zn = src2; zm = src1; break;
1449     case LE: cond = GE; zn = src2; zm = src1; break;
1450     case LO: cond = HI; zn = src2; zm = src1; break;
1451     case LS: cond = HS; zn = src2; zm = src1; break;
1452     case NE: cond = EQ; needs_negation = true; break;
1453     default:
1454       break;
1455   }
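       // For example, "src1 < src2" is emitted as "src2 > src1" (operands swapped),
       // and NE is emitted as EQ followed by a bitwise NOT of the result.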
1456 
1457   if (is_floating_point_type(bt)) {
1458     fcm(cond, dst, size, zn, zm);
1459   } else {
1460     cm(cond, dst, size, zn, zm);
1461   }
1462 
1463   if (needs_negation) {
1464     notr(dst, isQ ? T16B : T8B, dst);
1465   }
1466 }
1467 
1468 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1469                                           Condition cond, bool isQ) {
1470   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1471   if (bt == T_FLOAT || bt == T_DOUBLE) {
1472     if (cond == Assembler::NE) {
1473       fcm(Assembler::EQ, dst, size, src);
1474       notr(dst, isQ ? T16B : T8B, dst);
1475     } else {
1476       fcm(cond, dst, size, src);
1477     }
1478   } else {
1479     if (cond == Assembler::NE) {
1480       cm(Assembler::EQ, dst, size, src);
1481       notr(dst, isQ ? T16B : T8B, dst);
1482     } else {
1483       cm(cond, dst, size, src);
1484     }
1485   }
1486 }
1487 
1488 // Compress the least significant bit of each byte to the rightmost and clear
1489 // the higher garbage bits.
1490 void C2_MacroAssembler::bytemask_compress(Register dst) {
1491   // Example input, dst = 0x01 00 00 00 01 01 00 01
1492   // The "??" bytes are garbage.
1493   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1494   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1495   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1496   andr(dst, dst, 0xff);                   // dst = 0x8D
1497 }
1498 
1499 // Pack the lowest-numbered bit of each mask element in src into a long value
1500 // in dst, at most the first 64 lane elements.
1501 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1502 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1503                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1504   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1505   assert_different_registers(dst, rscratch1);
1506   assert_different_registers(vtmp1, vtmp2);
1507 
1508   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1509   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1510   // Expected:  dst = 0x658D
1511 
1512   // Convert the mask into vector with sequential bytes.
1513   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1514   sve_cpy(vtmp1, size, src, 1, false);
1515   if (bt != T_BYTE) {
1516     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1517   }
1518 
1519   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1520     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1521     // is to gather the significant bit of each byte in a cross-lane way. Due
1522     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1523     // (bit-compress within each lane) with the largest lane size (T = D) and
1524     // then concatenate the results.
1525 
1526     // The second source input of BEXT, initialized with 0x01 in each byte.
1527     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1528     sve_dup(vtmp2, B, 1);
1529 
1530     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1531     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1532     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1533     //         ---------------------------------------
1534     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1535     sve_bext(vtmp1, D, vtmp1, vtmp2);
1536 
1537     // Concatenate the least significant 8 bits of each 8-byte group, and move
1538     // the result to dst.
1539     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1540     // dst   = 0x658D
1541     if (lane_cnt <= 8) {
1542       // No need to concatenate.
1543       umov(dst, vtmp1, B, 0);
1544     } else if (lane_cnt <= 16) {
1545       ins(vtmp1, B, vtmp1, 1, 8);
1546       umov(dst, vtmp1, H, 0);
1547     } else {
1548       // As the lane count is 64 at most, the final expected value must be in
1549       // the lowest 64 bits after narrowing vtmp1 from D to B.
1550       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1551       umov(dst, vtmp1, D, 0);
1552     }
1553   } else if (UseSVE > 0) {
1554     // Compress the lowest 8 bytes.
1555     fmovd(dst, vtmp1);
1556     bytemask_compress(dst);
1557     if (lane_cnt <= 8) return;
1558 
1559     // Repeat on higher bytes and join the results.
1560     // Compress 8 bytes in each iteration.
1561     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1562       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1563       bytemask_compress(rscratch1);
1564       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1565     }
1566   } else {
1567     assert(false, "unsupported");
1568     ShouldNotReachHere();
1569   }
1570 }
1571 
1572 // Unpack the mask, a long value in src, into predicate register dst based on the
1573 // corresponding data type. Note that dst can support at most 64 lanes.
1574 // Below example gives the expected dst predicate register in different types, with
1575 // a valid src(0x658D) on a 1024-bit vector size machine.
1576 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1577 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1578 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1579 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1580 //
1581 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1582 // has 24 significant bits would be an invalid input if dst predicate register refers to
1583 // a LONG type 1024-bit vector, which has at most 16 lanes.
1584 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1585                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1586   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1587          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1588   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1589   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1590   // Expected:  dst = 0b01100101 10001101
1591 
1592   // Put long value from general purpose register into the first lane of vector.
1593   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1594   sve_dup(vtmp1, B, 0);
1595   mov(vtmp1, D, 0, src);
1596 
1597   // As sve_cmp generates the mask with a granularity of one byte, we have to
1598   // transform the bit mask that currently sits in the first lane into a byte
1599   // mask, which can be done with SVE2's BDEP instruction.
1600 
1601   // The first source input of the BDEP instruction: place one mask byte in each 8-byte lane.
1602   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1603   if (lane_cnt <= 8) {
1604     // Nothing to do, as only one byte exists.
1605   } else if (lane_cnt <= 16) {
1606     ins(vtmp1, B, vtmp1, 8, 1);
1607     mov(vtmp1, B, 1, zr);
1608   } else {
1609     sve_vector_extend(vtmp1, D, vtmp1, B);
1610   }
1611 
1612   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1613   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1614   sve_dup(vtmp2, B, 1);
1615 
1616   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1617   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1618   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1619   //         ---------------------------------------
1620   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1621   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1622 
1623   if (bt != T_BYTE) {
1624     sve_vector_extend(vtmp1, size, vtmp1, B);
1625   }
1626   // Generate mask according to the given vector, in which the elements have been
1627   // extended to expected type.
1628   // dst = 0b01100101 10001101
1629   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1630 }
1631 
1632 // Clobbers: rflags
1633 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1634                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1635   assert(pg->is_governing(), "This register has to be a governing predicate register");
1636   FloatRegister z1 = zn, z2 = zm;
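       // As in neon_compare, map LE/LT/LO/LS onto GE/GT/HI/HS by swapping the
       // operands so that the predicated compares below need only one direction.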
1637   switch (cond) {
1638     case LE: z1 = zm; z2 = zn; cond = GE; break;
1639     case LT: z1 = zm; z2 = zn; cond = GT; break;
1640     case LO: z1 = zm; z2 = zn; cond = HI; break;
1641     case LS: z1 = zm; z2 = zn; cond = HS; break;
1642     default:
1643       break;
1644   }
1645 
1646   SIMD_RegVariant size = elemType_to_regVariant(bt);
1647   if (is_floating_point_type(bt)) {
1648     sve_fcm(cond, pd, size, pg, z1, z2);
1649   } else {
1650     assert(is_integral_type(bt), "unsupported element type");
1651     sve_cmp(cond, pd, size, pg, z1, z2);
1652   }
1653 }
1654 
1655 // Get index of the last mask lane that is set
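     // The mask is reversed, broken before its first set lane, and the set lanes
     // of the break result are counted; subtracting that count from
     // (max lane count - 1) yields the index of the last set lane. As an
     // illustration (assuming a 128-bit vector and bt = T_BYTE): if the last set
     // lane is at index 11, sve_rev moves it to index 4, sve_brkb/sve_cntp yield
     // 4, and dst = 15 - 4 = 11.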
1656 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1657   SIMD_RegVariant size = elemType_to_regVariant(bt);
1658   sve_rev(ptmp, size, src);
1659   sve_brkb(ptmp, ptrue, ptmp, false);
1660   sve_cntp(dst, size, ptrue, ptmp);
1661   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1662   subw(dst, rscratch1, dst);
1663 }
1664 
1665 // Extend integer vector src to dst with the same lane count
1666 // but larger element size, e.g. 4B -> 4I
1667 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1668                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
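       // _xshll with a shift amount of 0 is used purely as a widening move
       // (uxtl/sxtl); chaining two calls widens by two element-size steps.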
1669   if (src_bt == T_BYTE) {
1670     if (dst_bt == T_SHORT) {
1671       // 4B/8B to 4S/8S
1672       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1673     } else {
1674       // 4B to 4I
1675       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1676       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1677       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1678     }
1679   } else if (src_bt == T_SHORT) {
1680     // 4S to 4I
1681     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1682     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1683   } else if (src_bt == T_INT) {
1684     // 2I to 2L
1685     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1686     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1687   } else {
1688     ShouldNotReachHere();
1689   }
1690 }
1691 
1692 // Narrow integer vector src down to dst with the same lane count
1693 // but smaller element size, e.g. 4I -> 4B
1694 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1695                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1696   if (src_bt == T_SHORT) {
1697     // 4S/8S to 4B/8B
1698     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1699     assert(dst_bt == T_BYTE, "unsupported");
1700     xtn(dst, T8B, src, T8H);
1701   } else if (src_bt == T_INT) {
1702     // 4I to 4B/4S
1703     assert(src_vlen_in_bytes == 16, "unsupported");
1704     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1705     xtn(dst, T4H, src, T4S);
1706     if (dst_bt == T_BYTE) {
1707       xtn(dst, T8B, dst, T8H);
1708     }
1709   } else if (src_bt == T_LONG) {
1710     // 2L to 2I
1711     assert(src_vlen_in_bytes == 16, "unsupported");
1712     assert(dst_bt == T_INT, "unsupported");
1713     xtn(dst, T2S, src, T2D);
1714   } else {
1715     ShouldNotReachHere();
1716   }
1717 }
1718 
1719 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1720                                           FloatRegister src, SIMD_RegVariant src_size,
1721                                           bool is_unsigned) {
1722   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
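       // Each unpack step doubles the element size, so B->S chains two unpacks
       // and B->D chains three, always taking the low half of the previous result.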
1723 
1724   if (src_size == B) {
1725     switch (dst_size) {
1726     case H:
1727       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1728       break;
1729     case S:
1730       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1731       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1732       break;
1733     case D:
1734       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1735       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1736       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1737       break;
1738     default:
1739       ShouldNotReachHere();
1740     }
1741   } else if (src_size == H) {
1742     if (dst_size == S) {
1743       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1744     } else { // D
1745       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1746       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1747     }
1748   } else if (src_size == S) {
1749     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1750   }
1751 }
1752 
1753 // Narrow the vector from src down to dst with the specified element sizes.
1754 // The high part of the dst vector will be filled with zeroes.
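     // A sketch of the idea for a D -> S narrow: sve_uzp1(dst, S, src, tmp)
     // concatenates the even-numbered S elements of src with those of tmp; as
     // tmp has been zeroed, the low half of dst receives the narrowed values and
     // the high half receives zeroes.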
1755 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1756                                           FloatRegister src, SIMD_RegVariant src_size,
1757                                           FloatRegister tmp) {
1758   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1759   assert_different_registers(src, tmp);
1760   sve_dup(tmp, src_size, 0);
1761   if (src_size == D) {
1762     switch (dst_size) {
1763     case S:
1764       sve_uzp1(dst, S, src, tmp);
1765       break;
1766     case H:
1767       assert_different_registers(dst, tmp);
1768       sve_uzp1(dst, S, src, tmp);
1769       sve_uzp1(dst, H, dst, tmp);
1770       break;
1771     case B:
1772       assert_different_registers(dst, tmp);
1773       sve_uzp1(dst, S, src, tmp);
1774       sve_uzp1(dst, H, dst, tmp);
1775       sve_uzp1(dst, B, dst, tmp);
1776       break;
1777     default:
1778       ShouldNotReachHere();
1779     }
1780   } else if (src_size == S) {
1781     if (dst_size == H) {
1782       sve_uzp1(dst, H, src, tmp);
1783     } else { // B
1784       assert_different_registers(dst, tmp);
1785       sve_uzp1(dst, H, src, tmp);
1786       sve_uzp1(dst, B, dst, tmp);
1787     }
1788   } else if (src_size == H) {
1789     sve_uzp1(dst, B, src, tmp);
1790   }
1791 }
1792 
1793 // Extend src predicate to dst predicate with the same lane count but larger
1794 // element size, e.g. 64Byte -> 512Long
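     // For illustration, a 2x extend (2I -> 2L) on a 128-bit machine:
     // Mask (for 2 ints)                        : TT
     // Predicate register for the above mask    : 00000000 00010001
     // After sve_punpklo (each element widened) : 00000001 00000001
     // which is the same two lanes expressed as a LONG-element mask.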
1795 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1796                                              uint dst_element_length_in_bytes,
1797                                              uint src_element_length_in_bytes) {
1798   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1799     sve_punpklo(dst, src);
1800   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1801     sve_punpklo(dst, src);
1802     sve_punpklo(dst, dst);
1803   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1804     sve_punpklo(dst, src);
1805     sve_punpklo(dst, dst);
1806     sve_punpklo(dst, dst);
1807   } else {
1808     assert(false, "unsupported");
1809     ShouldNotReachHere();
1810   }
1811 }
1812 
1813 // Narrow src predicate to dst predicate with the same lane count but
1814 // smaller element size, e.g. 512Long -> 64Byte
1815 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1816                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1817   // The insignificant bits in the src predicate are expected to be zero.
1818   // To ensure that the higher-order bits of the resulting narrowed vector are 0, an all-zero
1819   // predicate is passed as the second argument. An example narrowing with a given mask:
1820   // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1821   // Mask (for 2 longs) : TF
1822   // Predicate register for the above mask (16 bits) : 00000001 00000000
1823   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1824   // which translates to a mask of TF for 2 ints (the lower half is used while the upper half is 0)
1825   assert_different_registers(src, ptmp);
1826   assert_different_registers(dst, ptmp);
1827   sve_pfalse(ptmp);
1828   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1829     sve_uzp1(dst, B, src, ptmp);
1830   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1831     sve_uzp1(dst, H, src, ptmp);
1832     sve_uzp1(dst, B, dst, ptmp);
1833   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1834     sve_uzp1(dst, S, src, ptmp);
1835     sve_uzp1(dst, H, dst, ptmp);
1836     sve_uzp1(dst, B, dst, ptmp);
1837   } else {
1838     assert(false, "unsupported");
1839     ShouldNotReachHere();
1840   }
1841 }
1842 
1843 // Vector reduction add for integral type with ASIMD instructions.
1844 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1845                                                  Register isrc, FloatRegister vsrc,
1846                                                  unsigned vector_length_in_bytes,
1847                                                  FloatRegister vtmp) {
1848   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1849   assert_different_registers(dst, isrc);
1850   bool isQ = vector_length_in_bytes == 16;
1851 
1852   BLOCK_COMMENT("neon_reduce_add_integral {");
1853     switch(bt) {
1854       case T_BYTE:
1855         addv(vtmp, isQ ? T16B : T8B, vsrc);
1856         smov(dst, vtmp, B, 0);
1857         addw(dst, dst, isrc, ext::sxtb);
1858         break;
1859       case T_SHORT:
1860         addv(vtmp, isQ ? T8H : T4H, vsrc);
1861         smov(dst, vtmp, H, 0);
1862         addw(dst, dst, isrc, ext::sxth);
1863         break;
1864       case T_INT:
1865         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1866         umov(dst, vtmp, S, 0);
1867         addw(dst, dst, isrc);
1868         break;
1869       case T_LONG:
1870         assert(isQ, "unsupported");
1871         addpd(vtmp, vsrc);
1872         umov(dst, vtmp, D, 0);
1873         add(dst, dst, isrc);
1874         break;
1875       default:
1876         assert(false, "unsupported");
1877         ShouldNotReachHere();
1878     }
1879   BLOCK_COMMENT("} neon_reduce_add_integral");
1880 }
1881 
1882 // Vector reduction multiply for integral type with ASIMD instructions.
1883 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1884 // Clobbers: rscratch1
1885 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1886                                                  Register isrc, FloatRegister vsrc,
1887                                                  unsigned vector_length_in_bytes,
1888                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1889   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1890   bool isQ = vector_length_in_bytes == 16;
1891 
1892   BLOCK_COMMENT("neon_reduce_mul_integral {");
1893     switch(bt) {
1894       case T_BYTE:
1895         if (isQ) {
1896           // Multiply the lower half and higher half of vector iteratively.
1897           // vtmp1 = vsrc[8:15]
1898           ins(vtmp1, D, vsrc, 0, 1);
1899           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1900           mulv(vtmp1, T8B, vtmp1, vsrc);
1901           // vtmp2 = vtmp1[4:7]
1902           ins(vtmp2, S, vtmp1, 0, 1);
1903           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1904           mulv(vtmp1, T8B, vtmp2, vtmp1);
1905         } else {
1906           ins(vtmp1, S, vsrc, 0, 1);
1907           mulv(vtmp1, T8B, vtmp1, vsrc);
1908         }
1909         // vtmp2 = vtmp1[2:3]
1910         ins(vtmp2, H, vtmp1, 0, 1);
1911         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1912         mulv(vtmp2, T8B, vtmp2, vtmp1);
1913         // dst = vtmp2[0] * isrc * vtmp2[1]
1914         umov(rscratch1, vtmp2, B, 0);
1915         mulw(dst, rscratch1, isrc);
1916         sxtb(dst, dst);
1917         umov(rscratch1, vtmp2, B, 1);
1918         mulw(dst, rscratch1, dst);
1919         sxtb(dst, dst);
1920         break;
1921       case T_SHORT:
1922         if (isQ) {
1923           ins(vtmp2, D, vsrc, 0, 1);
1924           mulv(vtmp2, T4H, vtmp2, vsrc);
1925           ins(vtmp1, S, vtmp2, 0, 1);
1926           mulv(vtmp1, T4H, vtmp1, vtmp2);
1927         } else {
1928           ins(vtmp1, S, vsrc, 0, 1);
1929           mulv(vtmp1, T4H, vtmp1, vsrc);
1930         }
1931         umov(rscratch1, vtmp1, H, 0);
1932         mulw(dst, rscratch1, isrc);
1933         sxth(dst, dst);
1934         umov(rscratch1, vtmp1, H, 1);
1935         mulw(dst, rscratch1, dst);
1936         sxth(dst, dst);
1937         break;
1938       case T_INT:
1939         if (isQ) {
1940           ins(vtmp1, D, vsrc, 0, 1);
1941           mulv(vtmp1, T2S, vtmp1, vsrc);
1942         } else {
1943           vtmp1 = vsrc;
1944         }
1945         umov(rscratch1, vtmp1, S, 0);
1946         mul(dst, rscratch1, isrc);
1947         umov(rscratch1, vtmp1, S, 1);
1948         mul(dst, rscratch1, dst);
1949         break;
1950       case T_LONG:
1951         umov(rscratch1, vsrc, D, 0);
1952         mul(dst, isrc, rscratch1);
1953         umov(rscratch1, vsrc, D, 1);
1954         mul(dst, dst, rscratch1);
1955         break;
1956       default:
1957         assert(false, "unsupported");
1958         ShouldNotReachHere();
1959     }
1960   BLOCK_COMMENT("} neon_reduce_mul_integral");
1961 }
1962 
1963 // Vector reduction multiply for floating-point type with ASIMD instructions.
1964 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1965                                            FloatRegister fsrc, FloatRegister vsrc,
1966                                            unsigned vector_length_in_bytes,
1967                                            FloatRegister vtmp) {
1968   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1969   bool isQ = vector_length_in_bytes == 16;
1970 
1971   BLOCK_COMMENT("neon_reduce_mul_fp {");
1972     switch(bt) {
1973       case T_FLOAT:
1974         fmuls(dst, fsrc, vsrc);
1975         ins(vtmp, S, vsrc, 0, 1);
1976         fmuls(dst, dst, vtmp);
1977         if (isQ) {
1978           ins(vtmp, S, vsrc, 0, 2);
1979           fmuls(dst, dst, vtmp);
1980           ins(vtmp, S, vsrc, 0, 3);
1981           fmuls(dst, dst, vtmp);
1982          }
1983         break;
1984       case T_DOUBLE:
1985         assert(isQ, "unsupported");
1986         fmuld(dst, fsrc, vsrc);
1987         ins(vtmp, D, vsrc, 0, 1);
1988         fmuld(dst, dst, vtmp);
1989         break;
1990       default:
1991         assert(false, "unsupported");
1992         ShouldNotReachHere();
1993     }
1994   BLOCK_COMMENT("} neon_reduce_mul_fp");
1995 }
1996 
1997 // Helper to select logical instruction
1998 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1999                                                    Register Rn, Register Rm,
2000                                                    enum shift_kind kind, unsigned shift) {
2001   switch(opc) {
2002     case Op_AndReductionV:
2003       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2004       break;
2005     case Op_OrReductionV:
2006       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2007       break;
2008     case Op_XorReductionV:
2009       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2010       break;
2011     default:
2012       assert(false, "unsupported");
2013       ShouldNotReachHere();
2014   }
2015 }
2016 
2017 // Vector reduction logical operations And, Or, Xor
2018 // Clobbers: rscratch1
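     // The two 64-bit (or 32-bit) halves of the vector are first moved to general
     // registers and combined; the result is then folded with shifted-register
     // logical operations down to a single element, combined with isrc, and
     // sign-extended for sub-word types.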
2019 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2020                                             Register isrc, FloatRegister vsrc,
2021                                             unsigned vector_length_in_bytes) {
2022   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2023          "unsupported");
2024   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2025   assert_different_registers(dst, isrc);
2026   bool isQ = vector_length_in_bytes == 16;
2027 
2028   BLOCK_COMMENT("neon_reduce_logical {");
2029     umov(rscratch1, vsrc, isQ ? D : S, 0);
2030     umov(dst, vsrc, isQ ? D : S, 1);
2031     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2032     switch(bt) {
2033       case T_BYTE:
2034         if (isQ) {
2035           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2036         }
2037         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2038         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2039         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2040         sxtb(dst, dst);
2041         break;
2042       case T_SHORT:
2043         if (isQ) {
2044           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2045         }
2046         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2047         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2048         sxth(dst, dst);
2049         break;
2050       case T_INT:
2051         if (isQ) {
2052           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2053         }
2054         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2055         break;
2056       case T_LONG:
2057         assert(isQ, "unsupported");
2058         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2059         break;
2060       default:
2061         assert(false, "unsupported");
2062         ShouldNotReachHere();
2063     }
2064   BLOCK_COMMENT("} neon_reduce_logical");
2065 }
2066 
2067 // Vector reduction min/max for integral type with ASIMD instructions.
2068 // Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
2069 // Clobbers: rscratch1, rflags
2070 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2071                                                     Register isrc, FloatRegister vsrc,
2072                                                     unsigned vector_length_in_bytes,
2073                                                     FloatRegister vtmp) {
2074   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2075   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2076   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2077   assert_different_registers(dst, isrc);
2078   bool isQ = vector_length_in_bytes == 16;
2079   bool is_min = opc == Op_MinReductionV;
2080 
2081   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2082     if (bt == T_LONG) {
2083       assert(vtmp == fnoreg, "should be");
2084       assert(isQ, "should be");
2085       umov(rscratch1, vsrc, D, 0);
2086       cmp(isrc, rscratch1);
2087       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2088       umov(rscratch1, vsrc, D, 1);
2089       cmp(dst, rscratch1);
2090       csel(dst, dst, rscratch1, is_min ? LT : GT);
2091     } else {
2092       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2093       if (size == T2S) {
2094         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2095       } else {
2096         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2097       }
2098       if (bt == T_INT) {
2099         umov(dst, vtmp, S, 0);
2100       } else {
2101         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2102       }
2103       cmpw(dst, isrc);
2104       cselw(dst, dst, isrc, is_min ? LT : GT);
2105     }
2106   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2107 }
2108 
2109 // Vector reduction for integral type with SVE instruction.
2110 // Supported operations are Add, And, Or, Xor, Max, Min.
2111 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2112 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2113                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2114   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2115   assert(pg->is_governing(), "This register has to be a governing predicate register");
2116   assert_different_registers(src1, dst);
2117   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2118   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2119   switch (opc) {
2120     case Op_AddReductionVI: {
2121       sve_uaddv(tmp, size, pg, src2);
2122       if (bt == T_BYTE) {
2123         smov(dst, tmp, size, 0);
2124         addw(dst, src1, dst, ext::sxtb);
2125       } else if (bt == T_SHORT) {
2126         smov(dst, tmp, size, 0);
2127         addw(dst, src1, dst, ext::sxth);
2128       } else {
2129         umov(dst, tmp, size, 0);
2130         addw(dst, dst, src1);
2131       }
2132       break;
2133     }
2134     case Op_AddReductionVL: {
2135       sve_uaddv(tmp, size, pg, src2);
2136       umov(dst, tmp, size, 0);
2137       add(dst, dst, src1);
2138       break;
2139     }
2140     case Op_AndReductionV: {
2141       sve_andv(tmp, size, pg, src2);
2142       if (bt == T_INT || bt == T_LONG) {
2143         umov(dst, tmp, size, 0);
2144       } else {
2145         smov(dst, tmp, size, 0);
2146       }
2147       if (bt == T_LONG) {
2148         andr(dst, dst, src1);
2149       } else {
2150         andw(dst, dst, src1);
2151       }
2152       break;
2153     }
2154     case Op_OrReductionV: {
2155       sve_orv(tmp, size, pg, src2);
2156       if (bt == T_INT || bt == T_LONG) {
2157         umov(dst, tmp, size, 0);
2158       } else {
2159         smov(dst, tmp, size, 0);
2160       }
2161       if (bt == T_LONG) {
2162         orr(dst, dst, src1);
2163       } else {
2164         orrw(dst, dst, src1);
2165       }
2166       break;
2167     }
2168     case Op_XorReductionV: {
2169       sve_eorv(tmp, size, pg, src2);
2170       if (bt == T_INT || bt == T_LONG) {
2171         umov(dst, tmp, size, 0);
2172       } else {
2173         smov(dst, tmp, size, 0);
2174       }
2175       if (bt == T_LONG) {
2176         eor(dst, dst, src1);
2177       } else {
2178         eorw(dst, dst, src1);
2179       }
2180       break;
2181     }
2182     case Op_MaxReductionV: {
2183       sve_smaxv(tmp, size, pg, src2);
2184       if (bt == T_INT || bt == T_LONG) {
2185         umov(dst, tmp, size, 0);
2186       } else {
2187         smov(dst, tmp, size, 0);
2188       }
2189       if (bt == T_LONG) {
2190         cmp(dst, src1);
2191         csel(dst, dst, src1, Assembler::GT);
2192       } else {
2193         cmpw(dst, src1);
2194         cselw(dst, dst, src1, Assembler::GT);
2195       }
2196       break;
2197     }
2198     case Op_MinReductionV: {
2199       sve_sminv(tmp, size, pg, src2);
2200       if (bt == T_INT || bt == T_LONG) {
2201         umov(dst, tmp, size, 0);
2202       } else {
2203         smov(dst, tmp, size, 0);
2204       }
2205       if (bt == T_LONG) {
2206         cmp(dst, src1);
2207         csel(dst, dst, src1, Assembler::LT);
2208       } else {
2209         cmpw(dst, src1);
2210         cselw(dst, dst, src1, Assembler::LT);
2211       }
2212       break;
2213     }
2214     default:
2215       assert(false, "unsupported");
2216       ShouldNotReachHere();
2217   }
2218 
2219   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2220     if (bt == T_BYTE) {
2221       sxtb(dst, dst);
2222     } else if (bt == T_SHORT) {
2223       sxth(dst, dst);
2224     }
2225   }
2226 }
2227 
2228 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2229 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2230 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2231 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2232   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2233   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2234 
2235   // Set all elements to false if the input "lane_cnt" is zero.
2236   if (lane_cnt == 0) {
2237     sve_pfalse(dst);
2238     return;
2239   }
2240 
2241   SIMD_RegVariant size = elemType_to_regVariant(bt);
2242   assert(size != Q, "invalid size");
2243 
2244   // Set all true if "lane_cnt" equals to the max lane count.
2245   if (lane_cnt == max_vector_length) {
2246     sve_ptrue(dst, size, /* ALL */ 0b11111);
2247     return;
2248   }
2249 
2250   // Fixed numbers for "ptrue".
2251   switch(lane_cnt) {
2252   case 1: /* VL1 */
2253   case 2: /* VL2 */
2254   case 3: /* VL3 */
2255   case 4: /* VL4 */
2256   case 5: /* VL5 */
2257   case 6: /* VL6 */
2258   case 7: /* VL7 */
2259   case 8: /* VL8 */
2260     sve_ptrue(dst, size, lane_cnt);
2261     return;
2262   case 16:
2263     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2264     return;
2265   case 32:
2266     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2267     return;
2268   case 64:
2269     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2270     return;
2271   case 128:
2272     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2273     return;
2274   case 256:
2275     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2276     return;
2277   default:
2278     break;
2279   }
2280 
2281   // Special patterns for "ptrue".
2282   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2283     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2284   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2285     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2286   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2287     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2288   } else {
2289     // Encode to "whileltw" for the remaining cases.
2290     mov(rscratch1, lane_cnt);
2291     sve_whileltw(dst, size, zr, rscratch1);
2292   }
2293 }
2294 
2295 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2296 // Any remaining elements of dst will be filled with zero.
2297 // Clobbers: rscratch1
2298 // Preserves: src, mask
2299 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2300                                            FloatRegister vtmp1, FloatRegister vtmp2,
2301                                            PRegister pgtmp) {
2302   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2303   assert_different_registers(dst, src, vtmp1, vtmp2);
2304   assert_different_registers(mask, pgtmp);
2305 
2306   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2307   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2308   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2309   sve_dup(vtmp2, H, 0);
2310 
2311   // Extend lowest half to type INT.
2312   // dst = 00004444 00003333 00002222 00001111
2313   sve_uunpklo(dst, S, src);
2314   // pgtmp = 00000001 00000000 00000001 00000001
2315   sve_punpklo(pgtmp, mask);
2316   // Pack the active INT-sized elements to the right,
2317   // and fill the remaining elements with zero.
2318   // dst = 00000000 00004444 00002222 00001111
2319   sve_compact(dst, S, dst, pgtmp);
2320   // Narrow the result back to type SHORT.
2321   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2322   sve_uzp1(dst, H, dst, vtmp2);
2323   // Count the active elements of lowest half.
2324   // rscratch1 = 3
2325   sve_cntp(rscratch1, S, ptrue, pgtmp);
2326 
2327   // Repeat to the highest half.
2328   // pgtmp = 00000001 00000000 00000000 00000001
2329   sve_punpkhi(pgtmp, mask);
2330   // vtmp1 = 00008888 00007777 00006666 00005555
2331   sve_uunpkhi(vtmp1, S, src);
2332   // vtmp1 = 00000000 00000000 00008888 00005555
2333   sve_compact(vtmp1, S, vtmp1, pgtmp);
2334   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2335   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2336 
2337   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2338   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2339   // Shift the compressed high part left (across lanes) by TRUE_CNT lanes,
2340   // where TRUE_CNT is the number of active elements in the compressed low part.
2341   neg(rscratch1, rscratch1);
2342   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2343   sve_index(vtmp2, H, rscratch1, 1);
2344   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2345   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2346 
2347   // Combine the shifted compressed high part with the compressed low part.
2348   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2349   sve_orr(dst, dst, vtmp1);
2350 }
2351 
2352 // Clobbers: rscratch1, rscratch2
2353 // Preserves: src, mask
2354 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2355                                           FloatRegister vtmp1, FloatRegister vtmp2,
2356                                           FloatRegister vtmp3, FloatRegister vtmp4,
2357                                           PRegister ptmp, PRegister pgtmp) {
2358   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2359   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2360   assert_different_registers(mask, ptmp, pgtmp);
2361   // Example input:   src   = 88 77 66 55 44 33 22 11
2362   //                  mask  = 01 00 00 01 01 00 01 01
2363   // Expected result: dst   = 00 00 00 88 55 44 22 11
2364 
2365   sve_dup(vtmp4, B, 0);
2366   // Extend lowest half to type SHORT.
2367   // vtmp1 = 0044 0033 0022 0011
2368   sve_uunpklo(vtmp1, H, src);
2369   // ptmp = 0001 0000 0001 0001
2370   sve_punpklo(ptmp, mask);
2371   // Count the active elements of lowest half.
2372   // rscratch2 = 3
2373   sve_cntp(rscratch2, H, ptrue, ptmp);
2374   // Pack the active SHORT-sized elements to the right,
2375   // and fill the remaining elements with zero.
2376   // dst = 0000 0044 0022 0011
2377   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2378   // Narrow the result back to type BYTE.
2379   // dst = 00 00 00 00 00 44 22 11
2380   sve_uzp1(dst, B, dst, vtmp4);
2381 
2382   // Repeat to the highest half.
2383   // ptmp = 0001 0000 0000 0001
2384   sve_punpkhi(ptmp, mask);
2385   // vtmp2 = 0088 0077 0066 0055
2386   sve_uunpkhi(vtmp2, H, src);
2387   // vtmp1 = 0000 0000 0088 0055
2388   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2389 
2390   sve_dup(vtmp4, B, 0);
2391   // vtmp1 = 00 00 00 00 00 00 88 55
2392   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2393 
2394   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2395   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2396   // Shift the compressed high part left (across lanes) by TRUE_CNT lanes,
2397   // where TRUE_CNT is the number of active elements in the compressed low part.
2398   neg(rscratch2, rscratch2);
2399   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2400   sve_index(vtmp2, B, rscratch2, 1);
2401   // vtmp1 = 00 00 00 88 55 00 00 00
2402   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2403   // Combine the shifted compressed high part with the compressed low part.
2404   // dst = 00 00 00 88 55 44 22 11
2405   sve_orr(dst, dst, vtmp1);
2406 }
2407 
2408 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2409   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
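       // rbit reverses the bits within each byte, so for elements wider than a
       // byte the byte order within each element is reversed first, yielding a
       // full per-element bit reversal.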
2410   SIMD_Arrangement size = isQ ? T16B : T8B;
2411   if (bt == T_BYTE) {
2412     rbit(dst, size, src);
2413   } else {
2414     neon_reverse_bytes(dst, src, bt, isQ);
2415     rbit(dst, size, dst);
2416   }
2417 }
2418 
2419 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2420   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2421   SIMD_Arrangement size = isQ ? T16B : T8B;
2422   switch (bt) {
2423     case T_BYTE:
2424       if (dst != src) {
2425         orr(dst, size, src, src);
2426       }
2427       break;
2428     case T_SHORT:
2429       rev16(dst, size, src);
2430       break;
2431     case T_INT:
2432       rev32(dst, size, src);
2433       break;
2434     case T_LONG:
2435       rev64(dst, size, src);
2436       break;
2437     default:
2438       assert(false, "unsupported");
2439       ShouldNotReachHere();
2440   }
2441 }
2442 
2443 // Extract a scalar element from an SVE vector at lane 'idx'.
2444 // The input elements in src are expected to be of integral type.
2445 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2446                                              int idx, FloatRegister vtmp) {
2447   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2448   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2449   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2450     if (bt == T_INT || bt == T_LONG) {
2451       umov(dst, src, size, idx);
2452     } else {
2453       smov(dst, src, size, idx);
2454     }
2455   } else {
2456     sve_orr(vtmp, src, src);
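         // sve_ext shifts the copy down by idx * element-size bytes (idx << size),
         // so the requested element lands in lane 0.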
2457     sve_ext(vtmp, vtmp, idx << size);
2458     if (bt == T_INT || bt == T_LONG) {
2459       umov(dst, vtmp, size, 0);
2460     } else {
2461       smov(dst, vtmp, size, 0);
2462     }
2463   }
2464 }
2465 
2466 // java.lang.Math::round intrinsics
2467 
2468 // Clobbers: rscratch1, rflags
2469 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2470                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2471   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2472   switch (T) {
2473     case T2S:
2474     case T4S:
2475       fmovs(tmp1, T, 0.5f);
2476       mov(rscratch1, jint_cast(0x1.0p23f));
2477       break;
2478     case T2D:
2479       fmovd(tmp1, T, 0.5);
2480       mov(rscratch1, julong_cast(0x1.0p52));
2481       break;
2482     default:
2483       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2484   }
2485   fadd(tmp1, T, tmp1, src);
2486   fcvtms(tmp1, T, tmp1);
2487   // tmp1 = floor(src + 0.5, ties to even)
2488 
2489   fcvtas(dst, T, src);
2490   // dst = round(src), ties to away
2491 
2492   fneg(tmp3, T, src);
2493   dup(tmp2, T, rscratch1);
2494   cm(HS, tmp3, T, tmp3, tmp2);
2495   // tmp3 is now a per-lane mask: all ones where the bit pattern of -src compares unsigned >= that of 2^23 (2^52 for T2D), else all zeros
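       // bif below keeps the dst bits where the tmp3 mask is set and takes the
       // tmp1 bits where it is clear, so lanes that fail the magnitude test use
       // floor(src + 0.5) while the remaining lanes keep the ties-to-away result.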
2496 
2497   bif(dst, T16B, tmp1, tmp3);
2498   // result in dst
2499 }
2500 
2501 // Clobbers: rscratch1, rflags
2502 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2503                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2504   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2505   assert_different_registers(tmp1, tmp2, src, dst);
2506 
2507   switch (T) {
2508     case S:
2509       mov(rscratch1, jint_cast(0x1.0p23f));
2510       break;
2511     case D:
2512       mov(rscratch1, julong_cast(0x1.0p52));
2513       break;
2514     default:
2515       assert(T == S || T == D, "invalid register variant");
2516   }
2517 
2518   sve_frinta(dst, T, ptrue, src);
2519   // dst = round(src), ties to away
2520 
2521   Label none;
2522 
2523   sve_fneg(tmp1, T, ptrue, src);
2524   sve_dup(tmp2, T, rscratch1);
2525   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
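       // pgtmp selects the lanes whose -src bit pattern is unsigned lower than or
       // equal to that of 2^23 (2^52 for D); only those lanes take the
       // floor(src + 0.5) correction below. If no lane is selected, the compare
       // leaves the Z flag set and the branch skips the correction entirely.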
2526   br(EQ, none);
2527   {
2528     sve_cpy(tmp1, T, pgtmp, 0.5);
2529     sve_fadd(tmp1, T, pgtmp, src);
2530     sve_frintm(dst, T, pgtmp, tmp1);
2531     // dst = floor(src + 0.5, ties to even)
2532   }
2533   bind(none);
2534 
2535   sve_fcvtzs(dst, T, ptrue, dst, T);
2536   // result in dst
2537 }
2538 
2539 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2540                                            FloatRegister one, SIMD_Arrangement T) {
2541   assert_different_registers(dst, src, zero, one);
2542   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2543 
2544   facgt(dst, T, src, zero);
2545   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
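       // bsl selects the bits of 'one' where the mask is set and the bits of src
       // where it is clear: finite non-zero lanes get the magnitude of 1.0 with
       // the sign bit of src (i.e. +-1.0), while +-0.0 and NaN lanes keep src.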
2546   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2547 }
2548 
2549 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2550                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2551     assert_different_registers(dst, src, zero, one, vtmp);
2552     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2553 
2554     sve_orr(vtmp, src, src);
2555     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN, 0x1 otherwise
2556     switch (T) {
2557     case S:
2558       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2559       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2560                                         // on the sign of the float value
2561       break;
2562     case D:
2563       sve_and(vtmp, T, min_jlong);
2564       sve_orr(vtmp, T, jlong_cast(1.0));
2565       break;
2566     default:
2567       assert(false, "unsupported");
2568       ShouldNotReachHere();
2569     }
2570     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2571                                        // Result in dst
2572 }
2573 
2574 bool C2_MacroAssembler::in_scratch_emit_size() {
2575   if (ciEnv::current()->task() != nullptr) {
2576     PhaseOutput* phase_output = Compile::current()->output();
2577     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2578       return true;
2579     }
2580   }
2581   return MacroAssembler::in_scratch_emit_size();
2582 }
2583 
2584 void C2_MacroAssembler::load_nklass_compact(Register dst, Register obj, Register index, int scale, int disp) {
2585   // Note: Don't clobber obj anywhere in this method!
2586 
2587   // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
2588   // obj-start, so that we can load from the object's mark-word instead. Usually the address
2589   // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
2590   // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
2591   // then passes that register as obj and 0 in disp. The following code extracts the base
2592   // and offset to load the mark-word.
2593   int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
2594   if (index == noreg) {
2595     ldr(dst, Address(obj, offset));
2596   } else {
2597     lea(dst, Address(obj, index, Address::lsl(scale)));
2598     ldr(dst, Address(dst, offset));
2599   }
2600   lsr(dst, dst, markWord::klass_shift);
2601 }