/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // Dummy labels for just measuring the code size
    Label dummy_slow_path;
    Label dummy_continuation;
    Label dummy_guard;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    Label* guard = &dummy_guard;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
      guard = &stub->guard();
    }
    // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
    bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  }
}

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object and have now locked it; we continue at the label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition holds we are done (cont) and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
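    // Worked example of this check (illustration only, assuming 4K pages):
    // the mask is ~0xfff | 0b11 = 0xfffffffffffff003. If the mark is a
    // stack-lock pointing into our own stack near sp, then disp_hdr - sp is
    // a small positive offset (< page size) whose two lock bits are 00, so
    // the 'ands' below yields 0 (EQ) and the lock is treated as recursive.
    // A foreign address, or set lock bits, leaves nonzero bits under the
    // mask (NE) and we take the slow path.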
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST be branched to with flag == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST be branched to with flag == NE.
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrb(t1, Address(t1, Klass::misc_flags_offset()));
    tst(t1, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;
  const Register t3_t = t3;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST be branched to with flag == EQ.
    Label push;

    const Register t2_top = t2;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");
    } else {
      Label monitor_found;

      // Load cache address
      lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        ldr(t1, Address(t3_t));
        cmp(obj, t1);
        br(Assembler::EQ, monitor_found);
        increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      ldr(t1, Address(t3_t));
      cmp(obj, t1);
      br(Assembler::EQ, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
      cbnz(t1, loop);
      // Cache Miss, NE set from cmp above, cbnz does not set flags
      b(slow_path);

      bind(monitor_found);
      ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
    }

    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
    const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);

    Label monitor_locked;

    // Compute owner address.
    lea(t2_owner_addr, owner_address);

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, monitor_locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(recursions_address, 1);

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
    }
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
                                                Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, box, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST be branched to with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST be branched to with flag == NE.
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    Label push_and_slow_path;

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_mark);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    // Because we got here by popping (i.e. we pushed when locking),
    // there will be no monitor in the box. So we need to push obj back
    // so that the runtime can fix any potential anonymous owner.
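    // (With lightweight locking, an inflating thread may install the monitor
    // with an anonymous owner; the runtime resolves the true owner via the
    // thread lock-stacks, which is why obj must be back on ours.)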
    tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    bind(push_and_slow_path);
    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_mark);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    const Register t1_monitor = t1;

    if (!UseObjectMonitorTable) {
      assert(t1_monitor == t1_mark, "should be the same here");

      // Untag the monitor.
      add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
    } else {
      ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      br(Assembler::LO, slow_path);
    }

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    Label release;
    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, release);

    // The owner may be anonymous and we removed the last obj entry in
    // the lock-stack. This loses the information about the owner.
    // Write the thread to the owner field so the runtime knows the owner.
    str(rthread, Address(t2_owner_addr));
    b(slow_path);

    bind(release);
    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }
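//
// Worked example of the preprocessing above (illustration only): for the
// pattern x = "abcab" (m = 5) the table becomes bc['a'] = 1, bc['b'] = 3,
// bc['c'] = 2, and bc[c] = 5 for every other c, i.e. the distance from the
// last occurrence of c in x[0..m-2] to the end of the pattern.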

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
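    // v0 was filled with cnt1 in every byte lane by the dup above, so the
    // eight paired 16-byte stores below initialize all ASIZE bc[] entries
    // on the stack to the default shift, i.e. the pattern length (known to
    // be < 256 at this point).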
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8LL/4UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've encountered a UTF symbol while searching for a Latin1
        // pattern, then we can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
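        // The loop below applies the classic SWAR "has-zero" test to a whole
        // register at a time: after the eor, a lane (byte or halfword) is
        // zero iff it matched the broadcast char, and
        //   (x - 0x01..01) & ~x & 0x80..80
        // is nonzero iff some lane of x is zero. 'bics' computes
        // tmp1 & ~(ch2 | 0x7f..7f), which is exactly that test, and sets the
        // flags, so NE means "a match somewhere in this word".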
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);
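  // Same SWAR "has-zero" halfword test as in string_indexof above: a zero
  // 16-bit lane after the eor marks a matching character.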

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);

    // Loop back if the particular char is not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location: BRKA keeps the predicate lanes
    // up to and including the first match, so INCP below adds the match's
    // lane position (plus one) to the base index.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);
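  // Byte-wise variant of the same SWAR "has-zero" test used in
  // string_indexof and string_indexof_char above.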

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
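    // (tmp1 ^ tmp2 is nonzero exactly at the differing bytes; the first
    // characters sit in the low-order bytes, so rev + clz gives the bit
    // index of the first difference, andr rounds it down to a character
    // boundary, and lsrv moves that character to bit 0 of each word
    // before the subtraction.)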
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange the code to take most branches while loads are in flight, and
  // to load the next characters while the previous ones are being compared.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }
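  // NEON has no two-register less-than or not-equal compares, so 'a < b'
  // is emitted as 'b > a' by swapping the operands above, and NE is
  // emitted as EQ followed by a bitwise NOT of the result.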
1461 
1462   if (is_floating_point_type(bt)) {
1463     fcm(cond, dst, size, zn, zm);
1464   } else {
1465     cm(cond, dst, size, zn, zm);
1466   }
1467 
1468   if (needs_negation) {
1469     notr(dst, isQ ? T16B : T8B, dst);
1470   }
1471 }
1472 
1473 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1474                                           Condition cond, bool isQ) {
1475   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1476   if (bt == T_FLOAT || bt == T_DOUBLE) {
1477     if (cond == Assembler::NE) {
1478       fcm(Assembler::EQ, dst, size, src);
1479       notr(dst, isQ ? T16B : T8B, dst);
1480     } else {
1481       fcm(cond, dst, size, src);
1482     }
1483   } else {
1484     if (cond == Assembler::NE) {
1485       cm(Assembler::EQ, dst, size, src);
1486       notr(dst, isQ ? T16B : T8B, dst);
1487     } else {
1488       cm(cond, dst, size, src);
1489     }
1490   }
1491 }
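
     // For example (hypothetical registers), neon_compare_zero(v0, T_FLOAT, v1, NE,
     // /*isQ*/ true) is expected to emit a compare against zero plus a negation:
     //   fcmeq v0.4s, v1.4s, #0.0
     //   not   v0.16b, v0.16b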
1492 
1493 // Compress the least significant bit of each byte of dst into the lowest byte,
1494 // and clear the higher garbage bits.
1495 void C2_MacroAssembler::bytemask_compress(Register dst) {
1496   // Example input, dst = 0x01 00 00 00 01 01 00 01
1497   // The "??" bytes are garbage.
1498   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1499   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x?? ?? ?? 08 ?? ?? ?? 0D
1500   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x?? ?? ?? ?? ?? ?? ?? 8D
1501   andr(dst, dst, 0xff);                   // dst = 0x8D
1502 }
1503 
1504 // Pack the lowest-numbered bit of each mask element in src into a long value
1505 // in dst, at most the first 64 lane elements.
1506 // Clobbers: rscratch1 if UseSVE == 1, or if the hardware doesn't support FEAT_BITPERM.
1507 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1508                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1509   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1510   assert_different_registers(dst, rscratch1);
1511   assert_different_registers(vtmp1, vtmp2);
1512 
1513   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1514   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1515   // Expected:  dst = 0x658D
1516 
1517   // Convert the mask into a vector with one byte per mask lane.
1518   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1519   sve_cpy(vtmp1, size, src, 1, false);
1520   if (bt != T_BYTE) {
1521     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1522   }
1523 
1524   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1525     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1526     // is to compress each significant bit of the byte in a cross-lane way. Due
1527     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1528     // (bit-compress in each lane) with the biggest lane size (T = D) then
1529     // concatenate the results.
1530 
1531     // The second source input of BEXT, initialized with 0x01 in each byte.
1532     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1533     sve_dup(vtmp2, B, 1);
1534 
1535     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1536     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1537     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1538     //         ---------------------------------------
1539     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1540     sve_bext(vtmp1, D, vtmp1, vtmp2);
1541 
1542     // Concatenate the lowest significant 8 bits of each 8-byte group, and extract
1543     // the result into dst.
1544     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1545     // dst   = 0x658D
1546     if (lane_cnt <= 8) {
1547       // No need to concatenate.
1548       umov(dst, vtmp1, B, 0);
1549     } else if (lane_cnt <= 16) {
1550       ins(vtmp1, B, vtmp1, 1, 8);
1551       umov(dst, vtmp1, H, 0);
1552     } else {
1553       // As the lane count is 64 at most, the final expected value must be in
1554       // the lowest 64 bits after narrowing vtmp1 from D to B.
1555       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1556       umov(dst, vtmp1, D, 0);
1557     }
1558   } else if (UseSVE > 0) {
1559     // Compress the lowest 8 bytes.
1560     fmovd(dst, vtmp1);
1561     bytemask_compress(dst);
1562     if (lane_cnt <= 8) return;
1563 
1564     // Repeat on higher bytes and join the results.
1565     // Compress 8 bytes in each iteration.
1566     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1567       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1568       bytemask_compress(rscratch1);
1569       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1570     }
1571   } else {
1572     assert(false, "unsupported");
1573     ShouldNotReachHere();
1574   }
1575 }
1576 
1577 // Unpack the mask, a long value in src, into predicate register dst based on the
1578 // corresponding data type. Note that dst can support at most 64 lanes.
1579 // The example below gives the expected dst predicate register for different types,
1580 // with a valid src (0x658D) on a machine with a 1024-bit vector size.
1581 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1582 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1583 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1584 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1585 //
1586 // The number of significant bits in src must equal lane_cnt. E.g., 0xFF658D, which
1587 // has 24 significant bits, would be an invalid input if the dst predicate register
1588 // refers to a 1024-bit vector of LONG, which has at most 16 lanes.
1589 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1590                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1591   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1592          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1593   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1594   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1595   // Expected:  dst = 0b01101001 10001101
1596 
1597   // Put the long value from the general-purpose register into the first lane of the vector.
1598   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1599   sve_dup(vtmp1, B, 0);
1600   mov(vtmp1, D, 0, src);
1601 
1602   // As sve_cmp generates the mask with a minimum granularity of one byte, we must
1603   // first transform the value in the first lane from a mask in bits into a mask
1604   // in bytes. SVE2's BDEP instruction does exactly this.
1605 
1606   // The first source input of the BDEP instruction. Deposit one mask byte into every 8 bytes.
1607   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1608   if (lane_cnt <= 8) {
1609     // Nothing to do, as only one byte exists.
1610   } else if (lane_cnt <= 16) {
1611     ins(vtmp1, B, vtmp1, 8, 1);
1612     mov(vtmp1, B, 1, zr);
1613   } else {
1614     sve_vector_extend(vtmp1, D, vtmp1, B);
1615   }
1616 
1617   // The second source input of the BDEP instruction, initialized with 0x01 in each byte.
1618   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1619   sve_dup(vtmp2, B, 1);
1620 
1621   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1622   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1623   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1624   //         ---------------------------------------
1625   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1626   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1627 
1628   if (bt != T_BYTE) {
1629     sve_vector_extend(vtmp1, size, vtmp1, B);
1630   }
1631   // Generate the mask from the given vector, whose elements have been
1632   // extended to the expected type.
1633   // dst = 0b01101001 10001101
1634   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1635 }
1636 
1637 // Clobbers: rflags
1638 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1639                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1640   assert(pg->is_governing(), "This register has to be a governing predicate register");
1641   FloatRegister z1 = zn, z2 = zm;
1642   switch (cond) {
1643     case LE: z1 = zm; z2 = zn; cond = GE; break;
1644     case LT: z1 = zm; z2 = zn; cond = GT; break;
1645     case LO: z1 = zm; z2 = zn; cond = HI; break;
1646     case LS: z1 = zm; z2 = zn; cond = HS; break;
1647     default:
1648       break;
1649   }
1650 
1651   SIMD_RegVariant size = elemType_to_regVariant(bt);
1652   if (is_floating_point_type(bt)) {
1653     sve_fcm(cond, pd, size, pg, z1, z2);
1654   } else {
1655     assert(is_integral_type(bt), "unsupported element type");
1656     sve_cmp(cond, pd, size, pg, z1, z2);
1657   }
1658 }
1659 
1660 // Get index of the last mask lane that is set
1661 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1662   SIMD_RegVariant size = elemType_to_regVariant(bt);
1663   sve_rev(ptmp, size, src);
1664   sve_brkb(ptmp, ptrue, ptmp, false);
1665   sve_cntp(dst, size, ptrue, ptmp);
1666   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1667   subw(dst, rscratch1, dst);
1668 }
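
     // A worked sketch, assuming bt == T_INT and MaxVectorSize == 16 (4 lanes):
     //   src  = 0b0011 (lanes 0 and 1 set)
     //   rev  : ptmp has lanes 2 and 3 set (lane order reversed)
     //   brkb : ptmp has lanes 0 and 1 set (all lanes before the first set lane)
     //   cntp : dst = 2
     //   dst  = (4 - 1) - 2 = 1, the index of the last set lane in src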
1669 
1670 // Extend integer vector src to dst with the same lane count
1671 // but larger element size, e.g. 4B -> 4I
1672 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1673                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1674   if (src_bt == T_BYTE) {
1675     if (dst_bt == T_SHORT) {
1676       // 4B/8B to 4S/8S
1677       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1678     } else {
1679       // 4B to 4I
1680       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1681       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1682       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1683     }
1684   } else if (src_bt == T_SHORT) {
1685     // 4S to 4I
1686     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1687     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1688   } else if (src_bt == T_INT) {
1689     // 2I to 2L
1690     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1691     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1692   } else {
1693     ShouldNotReachHere();
1694   }
1695 }
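
     // For instance (hypothetical registers), extending 4 bytes to 4 ints unsigned
     // with neon_vector_extend(v0, T_INT, 16, v1, T_BYTE, /*is_unsigned*/ true) is
     // expected to emit two widening shifts by zero:
     //   ushll v0.8h, v1.8b, #0
     //   ushll v0.4s, v0.4h, #0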
1696 
1697 // Narrow integer vector src down to dst with the same lane count
1698 // but smaller element size, e.g. 4I -> 4B
1699 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1700                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1701   if (src_bt == T_SHORT) {
1702     // 4S/8S to 4B/8B
1703     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1704     assert(dst_bt == T_BYTE, "unsupported");
1705     xtn(dst, T8B, src, T8H);
1706   } else if (src_bt == T_INT) {
1707     // 4I to 4B/4S
1708     assert(src_vlen_in_bytes == 16, "unsupported");
1709     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1710     xtn(dst, T4H, src, T4S);
1711     if (dst_bt == T_BYTE) {
1712       xtn(dst, T8B, dst, T8H);
1713     }
1714   } else if (src_bt == T_LONG) {
1715     // 2L to 2I
1716     assert(src_vlen_in_bytes == 16, "unsupported");
1717     assert(dst_bt == T_INT, "unsupported");
1718     xtn(dst, T2S, src, T2D);
1719   } else {
1720     ShouldNotReachHere();
1721   }
1722 }
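
     // E.g. narrowing 4 ints down to 4 bytes (hypothetical registers) with
     // neon_vector_narrow(v0, T_BYTE, v1, T_INT, 16) is expected to emit:
     //   xtn v0.4h, v1.4s
     //   xtn v0.8b, v0.8h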
1723 
1724 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1725                                           FloatRegister src, SIMD_RegVariant src_size,
1726                                           bool is_unsigned) {
1727   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1728 
1729   if (src_size == B) {
1730     switch (dst_size) {
1731     case H:
1732       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1733       break;
1734     case S:
1735       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1736       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1737       break;
1738     case D:
1739       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1740       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1741       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1742       break;
1743     default:
1744       ShouldNotReachHere();
1745     }
1746   } else if (src_size == H) {
1747     if (dst_size == S) {
1748       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1749     } else { // D
1750       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1751       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1752     }
1753   } else if (src_size == S) {
1754     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1755   }
1756 }
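
     // E.g. an unsigned B -> S extension (hypothetical registers) with
     // sve_vector_extend(z0, S, z1, B, /*is_unsigned*/ true) is expected to emit:
     //   uunpklo z0.h, z1.b
     //   uunpklo z0.s, z0.h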
1757 
1758 // Narrow the vector from src to dst with the specified element sizes.
1759 // The high part of the dst vector is filled with zeros.
1760 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1761                                           FloatRegister src, SIMD_RegVariant src_size,
1762                                           FloatRegister tmp) {
1763   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1764   assert_different_registers(src, tmp);
1765   sve_dup(tmp, src_size, 0);
1766   if (src_size == D) {
1767     switch (dst_size) {
1768     case S:
1769       sve_uzp1(dst, S, src, tmp);
1770       break;
1771     case H:
1772       assert_different_registers(dst, tmp);
1773       sve_uzp1(dst, S, src, tmp);
1774       sve_uzp1(dst, H, dst, tmp);
1775       break;
1776     case B:
1777       assert_different_registers(dst, tmp);
1778       sve_uzp1(dst, S, src, tmp);
1779       sve_uzp1(dst, H, dst, tmp);
1780       sve_uzp1(dst, B, dst, tmp);
1781       break;
1782     default:
1783       ShouldNotReachHere();
1784     }
1785   } else if (src_size == S) {
1786     if (dst_size == H) {
1787       sve_uzp1(dst, H, src, tmp);
1788     } else { // B
1789       assert_different_registers(dst, tmp);
1790       sve_uzp1(dst, H, src, tmp);
1791       sve_uzp1(dst, B, dst, tmp);
1792     }
1793   } else if (src_size == H) {
1794     sve_uzp1(dst, B, src, tmp);
1795   }
1796 }
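
     // E.g. a D -> S narrowing (hypothetical registers) with
     // sve_vector_narrow(z0, S, z1, D, z2) is expected to emit a zeroed tmp
     // followed by a single unzip:
     //   dup  z2.d, #0
     //   uzp1 z0.s, z1.s, z2.s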
1797 
1798 // Extend src predicate to dst predicate with the same lane count but larger
1799 // element size, e.g. 64Byte -> 512Long
1800 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1801                                              uint dst_element_length_in_bytes,
1802                                              uint src_element_length_in_bytes) {
1803   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1804     sve_punpklo(dst, src);
1805   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1806     sve_punpklo(dst, src);
1807     sve_punpklo(dst, dst);
1808   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1809     sve_punpklo(dst, src);
1810     sve_punpklo(dst, dst);
1811     sve_punpklo(dst, dst);
1812   } else {
1813     assert(false, "unsupported");
1814     ShouldNotReachHere();
1815   }
1816 }
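
     // E.g. widening a byte-lane mask to a short-lane mask (hypothetical
     // registers) with sve_vmaskcast_extend(p0, p1, 2, 1) is expected to emit a
     // single
     //   punpklo p0.h, p1.b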
1817 
1818 // Narrow src predicate to dst predicate with the same lane count but
1819 // smaller element size, e.g. 512Long -> 64Byte
1820 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1821                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1822   // The insignificant bits in src predicate are expected to be zero.
1823   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1824 // passed as the second argument. An example narrowing operation with a given mask would be:
1825 // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
1826   // Mask (for 2 Longs) : TF
1827   // Predicate register for the above mask (16 bits) : 00000001 00000000
1828   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1829   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1830   assert_different_registers(src, ptmp);
1831   assert_different_registers(dst, ptmp);
1832   sve_pfalse(ptmp);
1833   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1834     sve_uzp1(dst, B, src, ptmp);
1835   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1836     sve_uzp1(dst, H, src, ptmp);
1837     sve_uzp1(dst, B, dst, ptmp);
1838   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1839     sve_uzp1(dst, S, src, ptmp);
1840     sve_uzp1(dst, H, dst, ptmp);
1841     sve_uzp1(dst, B, dst, ptmp);
1842   } else {
1843     assert(false, "unsupported");
1844     ShouldNotReachHere();
1845   }
1846 }
1847 
1848 // Vector reduction add for integral type with ASIMD instructions.
1849 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1850                                                  Register isrc, FloatRegister vsrc,
1851                                                  unsigned vector_length_in_bytes,
1852                                                  FloatRegister vtmp) {
1853   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1854   assert_different_registers(dst, isrc);
1855   bool isQ = vector_length_in_bytes == 16;
1856 
1857   BLOCK_COMMENT("neon_reduce_add_integral {");
1858     switch(bt) {
1859       case T_BYTE:
1860         addv(vtmp, isQ ? T16B : T8B, vsrc);
1861         smov(dst, vtmp, B, 0);
1862         addw(dst, dst, isrc, ext::sxtb);
1863         break;
1864       case T_SHORT:
1865         addv(vtmp, isQ ? T8H : T4H, vsrc);
1866         smov(dst, vtmp, H, 0);
1867         addw(dst, dst, isrc, ext::sxth);
1868         break;
1869       case T_INT:
1870         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1871         umov(dst, vtmp, S, 0);
1872         addw(dst, dst, isrc);
1873         break;
1874       case T_LONG:
1875         assert(isQ, "unsupported");
1876         addpd(vtmp, vsrc);
1877         umov(dst, vtmp, D, 0);
1878         add(dst, dst, isrc);
1879         break;
1880       default:
1881         assert(false, "unsupported");
1882         ShouldNotReachHere();
1883     }
1884   BLOCK_COMMENT("} neon_reduce_add_integral");
1885 }
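
     // E.g. an int reduction over a 128-bit vector (hypothetical registers) with
     // neon_reduce_add_integral(r0, T_INT, r1, v1, 16, v2) is expected to emit:
     //   addv s2, v1.4s
     //   umov w0, v2.s[0]
     //   add  w0, w0, w1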
1886 
1887 // Vector reduction multiply for integral type with ASIMD instructions.
1888 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1889 // Clobbers: rscratch1
1890 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1891                                                  Register isrc, FloatRegister vsrc,
1892                                                  unsigned vector_length_in_bytes,
1893                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
1894   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1895   bool isQ = vector_length_in_bytes == 16;
1896 
1897   BLOCK_COMMENT("neon_reduce_mul_integral {");
1898     switch(bt) {
1899       case T_BYTE:
1900         if (isQ) {
1901           // Multiply the lower half and the upper half of the vector iteratively.
1902           // vtmp1 = vsrc[8:15]
1903           ins(vtmp1, D, vsrc, 0, 1);
1904           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1905           mulv(vtmp1, T8B, vtmp1, vsrc);
1906           // vtmp2 = vtmp1[4:7]
1907           ins(vtmp2, S, vtmp1, 0, 1);
1908           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1909           mulv(vtmp1, T8B, vtmp2, vtmp1);
1910         } else {
1911           ins(vtmp1, S, vsrc, 0, 1);
1912           mulv(vtmp1, T8B, vtmp1, vsrc);
1913         }
1914         // vtmp2 = vtmp1[2:3]
1915         ins(vtmp2, H, vtmp1, 0, 1);
1916         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1917         mulv(vtmp2, T8B, vtmp2, vtmp1);
1918         // dst = vtmp2[0] * isrc * vtmp2[1]
1919         umov(rscratch1, vtmp2, B, 0);
1920         mulw(dst, rscratch1, isrc);
1921         sxtb(dst, dst);
1922         umov(rscratch1, vtmp2, B, 1);
1923         mulw(dst, rscratch1, dst);
1924         sxtb(dst, dst);
1925         break;
1926       case T_SHORT:
1927         if (isQ) {
1928           ins(vtmp2, D, vsrc, 0, 1);
1929           mulv(vtmp2, T4H, vtmp2, vsrc);
1930           ins(vtmp1, S, vtmp2, 0, 1);
1931           mulv(vtmp1, T4H, vtmp1, vtmp2);
1932         } else {
1933           ins(vtmp1, S, vsrc, 0, 1);
1934           mulv(vtmp1, T4H, vtmp1, vsrc);
1935         }
1936         umov(rscratch1, vtmp1, H, 0);
1937         mulw(dst, rscratch1, isrc);
1938         sxth(dst, dst);
1939         umov(rscratch1, vtmp1, H, 1);
1940         mulw(dst, rscratch1, dst);
1941         sxth(dst, dst);
1942         break;
1943       case T_INT:
1944         if (isQ) {
1945           ins(vtmp1, D, vsrc, 0, 1);
1946           mulv(vtmp1, T2S, vtmp1, vsrc);
1947         } else {
1948           vtmp1 = vsrc;
1949         }
1950         umov(rscratch1, vtmp1, S, 0);
1951         mul(dst, rscratch1, isrc);
1952         umov(rscratch1, vtmp1, S, 1);
1953         mul(dst, rscratch1, dst);
1954         break;
1955       case T_LONG:
1956         umov(rscratch1, vsrc, D, 0);
1957         mul(dst, isrc, rscratch1);
1958         umov(rscratch1, vsrc, D, 1);
1959         mul(dst, dst, rscratch1);
1960         break;
1961       default:
1962         assert(false, "unsupported");
1963         ShouldNotReachHere();
1964     }
1965   BLOCK_COMMENT("} neon_reduce_mul_integral");
1966 }
1967 
1968 // Vector reduction multiply for floating-point type with ASIMD instructions.
1969 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1970                                            FloatRegister fsrc, FloatRegister vsrc,
1971                                            unsigned vector_length_in_bytes,
1972                                            FloatRegister vtmp) {
1973   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1974   bool isQ = vector_length_in_bytes == 16;
1975 
1976   BLOCK_COMMENT("neon_reduce_mul_fp {");
1977     switch(bt) {
1978       case T_FLOAT:
1979         fmuls(dst, fsrc, vsrc);
1980         ins(vtmp, S, vsrc, 0, 1);
1981         fmuls(dst, dst, vtmp);
1982         if (isQ) {
1983           ins(vtmp, S, vsrc, 0, 2);
1984           fmuls(dst, dst, vtmp);
1985           ins(vtmp, S, vsrc, 0, 3);
1986           fmuls(dst, dst, vtmp);
1987         }
1988         break;
1989       case T_DOUBLE:
1990         assert(isQ, "unsupported");
1991         fmuld(dst, fsrc, vsrc);
1992         ins(vtmp, D, vsrc, 0, 1);
1993         fmuld(dst, dst, vtmp);
1994         break;
1995       default:
1996         assert(false, "unsupported");
1997         ShouldNotReachHere();
1998     }
1999   BLOCK_COMMENT("} neon_reduce_mul_fp");
2000 }
2001 
2002 // Helper to select logical instruction
2003 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2004                                                    Register Rn, Register Rm,
2005                                                    enum shift_kind kind, unsigned shift) {
2006   switch(opc) {
2007     case Op_AndReductionV:
2008       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2009       break;
2010     case Op_OrReductionV:
2011       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2012       break;
2013     case Op_XorReductionV:
2014       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2015       break;
2016     default:
2017       assert(false, "unsupported");
2018       ShouldNotReachHere();
2019   }
2020 }
2021 
2022 // Vector reduction logical operations And, Or, Xor
2023 // Clobbers: rscratch1
2024 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2025                                             Register isrc, FloatRegister vsrc,
2026                                             unsigned vector_length_in_bytes) {
2027   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2028          "unsupported");
2029   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2030   assert_different_registers(dst, isrc);
2031   bool isQ = vector_length_in_bytes == 16;
2032 
2033   BLOCK_COMMENT("neon_reduce_logical {");
2034     umov(rscratch1, vsrc, isQ ? D : S, 0);
2035     umov(dst, vsrc, isQ ? D : S, 1);
2036     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2037     switch(bt) {
2038       case T_BYTE:
2039         if (isQ) {
2040           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2041         }
2042         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2043         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2044         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2045         sxtb(dst, dst);
2046         break;
2047       case T_SHORT:
2048         if (isQ) {
2049           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2050         }
2051         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2052         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2053         sxth(dst, dst);
2054         break;
2055       case T_INT:
2056         if (isQ) {
2057           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2058         }
2059         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2060         break;
2061       case T_LONG:
2062         assert(isQ, "unsupported");
2063         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2064         break;
2065       default:
2066         assert(false, "unsupported");
2067         ShouldNotReachHere();
2068     }
2069   BLOCK_COMMENT("} neon_reduce_logical");
2070 }
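
     // E.g. Op_AndReductionV with bt == T_INT over 16 bytes (hypothetical
     // registers dst = r0, isrc = r1, vsrc = v1; x8 being rscratch1) is expected
     // to emit:
     //   umov x8, v1.d[0]
     //   umov x0, v1.d[1]
     //   and  x0, x0, x8
     //   and  x0, x0, x0, lsr #32
     //   and  w0, w1, w0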
2071 
2072 // Vector reduction min/max for integral type with ASIMD instructions.
2073 // Note: vtmp is not used and is expected to be fnoreg for the T_LONG case.
2074 // Clobbers: rscratch1, rflags
2075 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2076                                                     Register isrc, FloatRegister vsrc,
2077                                                     unsigned vector_length_in_bytes,
2078                                                     FloatRegister vtmp) {
2079   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2080   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2081   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2082   assert_different_registers(dst, isrc);
2083   bool isQ = vector_length_in_bytes == 16;
2084   bool is_min = opc == Op_MinReductionV;
2085 
2086   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2087     if (bt == T_LONG) {
2088       assert(vtmp == fnoreg, "should be");
2089       assert(isQ, "should be");
2090       umov(rscratch1, vsrc, D, 0);
2091       cmp(isrc, rscratch1);
2092       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2093       umov(rscratch1, vsrc, D, 1);
2094       cmp(dst, rscratch1);
2095       csel(dst, dst, rscratch1, is_min ? LT : GT);
2096     } else {
2097       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
2098       if (size == T2S) {
2099         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2100       } else {
2101         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2102       }
2103       if (bt == T_INT) {
2104         umov(dst, vtmp, S, 0);
2105       } else {
2106         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2107       }
2108       cmpw(dst, isrc);
2109       cselw(dst, dst, isrc, is_min ? LT : GT);
2110     }
2111   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2112 }
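
     // E.g. Op_MinReductionV with bt == T_INT over a 128-bit vector (hypothetical
     // registers dst = r0, isrc = r1, vsrc = v1, vtmp = v2) is expected to emit:
     //   sminv s2, v1.4s
     //   umov  w0, v2.s[0]
     //   cmp   w0, w1
     //   csel  w0, w0, w1, lt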
2113 
2114 // Vector reduction for integral type with SVE instruction.
2115 // Supported operations are Add, And, Or, Xor, Max, Min.
2116 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2117 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2118                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2119   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2120   assert(pg->is_governing(), "This register has to be a governing predicate register");
2121   assert_different_registers(src1, dst);
2122   // Registers "dst" and "tmp" are clobbered, while "src1" and "src2" are preserved.
2123   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2124   switch (opc) {
2125     case Op_AddReductionVI: {
2126       sve_uaddv(tmp, size, pg, src2);
2127       if (bt == T_BYTE) {
2128         smov(dst, tmp, size, 0);
2129         addw(dst, src1, dst, ext::sxtb);
2130       } else if (bt == T_SHORT) {
2131         smov(dst, tmp, size, 0);
2132         addw(dst, src1, dst, ext::sxth);
2133       } else {
2134         umov(dst, tmp, size, 0);
2135         addw(dst, dst, src1);
2136       }
2137       break;
2138     }
2139     case Op_AddReductionVL: {
2140       sve_uaddv(tmp, size, pg, src2);
2141       umov(dst, tmp, size, 0);
2142       add(dst, dst, src1);
2143       break;
2144     }
2145     case Op_AndReductionV: {
2146       sve_andv(tmp, size, pg, src2);
2147       if (bt == T_INT || bt == T_LONG) {
2148         umov(dst, tmp, size, 0);
2149       } else {
2150         smov(dst, tmp, size, 0);
2151       }
2152       if (bt == T_LONG) {
2153         andr(dst, dst, src1);
2154       } else {
2155         andw(dst, dst, src1);
2156       }
2157       break;
2158     }
2159     case Op_OrReductionV: {
2160       sve_orv(tmp, size, pg, src2);
2161       if (bt == T_INT || bt == T_LONG) {
2162         umov(dst, tmp, size, 0);
2163       } else {
2164         smov(dst, tmp, size, 0);
2165       }
2166       if (bt == T_LONG) {
2167         orr(dst, dst, src1);
2168       } else {
2169         orrw(dst, dst, src1);
2170       }
2171       break;
2172     }
2173     case Op_XorReductionV: {
2174       sve_eorv(tmp, size, pg, src2);
2175       if (bt == T_INT || bt == T_LONG) {
2176         umov(dst, tmp, size, 0);
2177       } else {
2178         smov(dst, tmp, size, 0);
2179       }
2180       if (bt == T_LONG) {
2181         eor(dst, dst, src1);
2182       } else {
2183         eorw(dst, dst, src1);
2184       }
2185       break;
2186     }
2187     case Op_MaxReductionV: {
2188       sve_smaxv(tmp, size, pg, src2);
2189       if (bt == T_INT || bt == T_LONG) {
2190         umov(dst, tmp, size, 0);
2191       } else {
2192         smov(dst, tmp, size, 0);
2193       }
2194       if (bt == T_LONG) {
2195         cmp(dst, src1);
2196         csel(dst, dst, src1, Assembler::GT);
2197       } else {
2198         cmpw(dst, src1);
2199         cselw(dst, dst, src1, Assembler::GT);
2200       }
2201       break;
2202     }
2203     case Op_MinReductionV: {
2204       sve_sminv(tmp, size, pg, src2);
2205       if (bt == T_INT || bt == T_LONG) {
2206         umov(dst, tmp, size, 0);
2207       } else {
2208         smov(dst, tmp, size, 0);
2209       }
2210       if (bt == T_LONG) {
2211         cmp(dst, src1);
2212         csel(dst, dst, src1, Assembler::LT);
2213       } else {
2214         cmpw(dst, src1);
2215         cselw(dst, dst, src1, Assembler::LT);
2216       }
2217       break;
2218     }
2219     default:
2220       assert(false, "unsupported");
2221       ShouldNotReachHere();
2222   }
2223 
2224   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2225     if (bt == T_BYTE) {
2226       sxtb(dst, dst);
2227     } else if (bt == T_SHORT) {
2228       sxth(dst, dst);
2229     }
2230   }
2231 }
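
     // E.g. Op_AddReductionVI with bt == T_INT (hypothetical registers dst = r0,
     // src1 = r1, src2 = z1, pg = p0, tmp = z2) is expected to emit:
     //   uaddv d2, p0, z1.s
     //   umov  w0, v2.s[0]
     //   add   w0, w0, w1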
2232 
2233 // Set the elements of the dst predicate to true for lanes in the range [0, lane_cnt),
2234 // and to false otherwise. The input "lane_cnt" must be less than or equal to the
2235 // supported max vector length of the basic type. Clobbers: rscratch1 and rFlagsReg.
2236 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2237   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2238   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2239 
2240   // Set all elements to false if the input "lane_cnt" is zero.
2241   if (lane_cnt == 0) {
2242     sve_pfalse(dst);
2243     return;
2244   }
2245 
2246   SIMD_RegVariant size = elemType_to_regVariant(bt);
2247   assert(size != Q, "invalid size");
2248 
2249   // Set all elements true if "lane_cnt" equals the max lane count.
2250   if (lane_cnt == max_vector_length) {
2251     sve_ptrue(dst, size, /* ALL */ 0b11111);
2252     return;
2253   }
2254 
2255   // Fixed numbers for "ptrue".
2256   switch(lane_cnt) {
2257   case 1: /* VL1 */
2258   case 2: /* VL2 */
2259   case 3: /* VL3 */
2260   case 4: /* VL4 */
2261   case 5: /* VL5 */
2262   case 6: /* VL6 */
2263   case 7: /* VL7 */
2264   case 8: /* VL8 */
2265     sve_ptrue(dst, size, lane_cnt);
2266     return;
2267   case 16:
2268     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2269     return;
2270   case 32:
2271     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2272     return;
2273   case 64:
2274     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2275     return;
2276   case 128:
2277     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2278     return;
2279   case 256:
2280     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2281     return;
2282   default:
2283     break;
2284   }
2285 
2286   // Special patterns for "ptrue".
2287   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2288     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2289   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2290     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2291   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2292     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2293   } else {
2294     // Encode to "whileltw" for the remaining cases.
2295     mov(rscratch1, lane_cnt);
2296     sve_whileltw(dst, size, zr, rscratch1);
2297   }
2298 }
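
     // E.g. with bt == T_BYTE on a 128-bit machine (16 lanes), lane_cnt == 10
     // matches neither a fixed nor a special "ptrue" pattern (POW2 and MUL4 would
     // give 16, MUL3 would give 15), so the fallback is expected to emit
     // (x8/w8 being rscratch1):
     //   mov x8, #10
     //   whilelt p0.b, wzr, w8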
2299 
2300 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2301 // Any remaining elements of dst will be filled with zero.
2302 // Clobbers: rscratch1
2303 // Preserves: src, mask
2304 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2305                                            FloatRegister vtmp1, FloatRegister vtmp2,
2306                                            PRegister pgtmp) {
2307   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2308   assert_different_registers(dst, src, vtmp1, vtmp2);
2309   assert_different_registers(mask, pgtmp);
2310 
2311   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2312   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2313   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2314   sve_dup(vtmp2, H, 0);
2315 
2316   // Extend lowest half to type INT.
2317   // dst = 00004444 00003333 00002222 00001111
2318   sve_uunpklo(dst, S, src);
2319   // pgtmp = 00000001 00000000 00000001 00000001
2320   sve_punpklo(pgtmp, mask);
2321   // Pack the active elements in size of type INT to the right,
2322   // and fill the remaining elements with zero.
2323   // dst = 00000000 00004444 00002222 00001111
2324   sve_compact(dst, S, dst, pgtmp);
2325   // Narrow the result back to type SHORT.
2326   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2327   sve_uzp1(dst, H, dst, vtmp2);
2328   // Count the active elements of lowest half.
2329   // rscratch1 = 3
2330   sve_cntp(rscratch1, S, ptrue, pgtmp);
2331 
2332   // Repeat to the highest half.
2333   // pgtmp = 00000001 00000000 00000000 00000001
2334   sve_punpkhi(pgtmp, mask);
2335   // vtmp1 = 00008888 00007777 00006666 00005555
2336   sve_uunpkhi(vtmp1, S, src);
2337   // vtmp1 = 00000000 00000000 00008888 00005555
2338   sve_compact(vtmp1, S, vtmp1, pgtmp);
2339   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2340   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2341 
2342   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2343   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2344   // Left-shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2345   // TRUE_CNT is the number of active elements in the compressed low part.
2346   neg(rscratch1, rscratch1);
2347   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2348   sve_index(vtmp2, H, rscratch1, 1);
2349   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2350   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2351 
2352   // Combine the compressed high part (after shifting) with the compressed low part.
2353   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2354   sve_orr(dst, dst, vtmp1);
2355 }
2356 
2357 // Clobbers: rscratch1, rscratch2
2358 // Preserves: src, mask
2359 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2360                                           FloatRegister vtmp1, FloatRegister vtmp2,
2361                                           FloatRegister vtmp3, FloatRegister vtmp4,
2362                                           PRegister ptmp, PRegister pgtmp) {
2363   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2364   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2365   assert_different_registers(mask, ptmp, pgtmp);
2366   // Example input:   src   = 88 77 66 55 44 33 22 11
2367   //                  mask  = 01 00 00 01 01 00 01 01
2368   // Expected result: dst   = 00 00 00 88 55 44 22 11
2369 
2370   sve_dup(vtmp4, B, 0);
2371   // Extend lowest half to type SHORT.
2372   // vtmp1 = 0044 0033 0022 0011
2373   sve_uunpklo(vtmp1, H, src);
2374   // ptmp = 0001 0000 0001 0001
2375   sve_punpklo(ptmp, mask);
2376   // Count the active elements of lowest half.
2377   // rscratch2 = 3
2378   sve_cntp(rscratch2, H, ptrue, ptmp);
2379   // Pack the active elements in size of type SHORT to the right,
2380   // and fill the remaining elements with zero.
2381   // dst = 0000 0044 0022 0011
2382   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2383   // Narrow the result back to type BYTE.
2384   // dst = 00 00 00 00 00 44 22 11
2385   sve_uzp1(dst, B, dst, vtmp4);
2386 
2387   // Repeat to the highest half.
2388   // ptmp = 0001 0000 0000 0001
2389   sve_punpkhi(ptmp, mask);
2390   // vtmp2 = 0088 0077 0066 0055
2391   sve_uunpkhi(vtmp2, H, src);
2392   // vtmp1 = 0000 0000 0088 0055
2393   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2394 
2395   sve_dup(vtmp4, B, 0);
2396   // vtmp1 = 00 00 00 00 00 00 88 55
2397   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2398 
2399   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2400   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2401   // Left-shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2402   // TRUE_CNT is the number of active elements in the compressed low part.
2403   neg(rscratch2, rscratch2);
2404   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2405   sve_index(vtmp2, B, rscratch2, 1);
2406   // vtmp1 = 00 00 00 88 55 00 00 00
2407   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2408   // Combine the compressed high part (after shifting) with the compressed low part.
2409   // dst = 00 00 00 88 55 44 22 11
2410   sve_orr(dst, dst, vtmp1);
2411 }
2412 
2413 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2414   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2415   SIMD_Arrangement size = isQ ? T16B : T8B;
2416   if (bt == T_BYTE) {
2417     rbit(dst, size, src);
2418   } else {
2419     neon_reverse_bytes(dst, src, bt, isQ);
2420     rbit(dst, size, dst);
2421   }
2422 }
2423 
2424 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2425   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2426   SIMD_Arrangement size = isQ ? T16B : T8B;
2427   switch (bt) {
2428     case T_BYTE:
2429       if (dst != src) {
2430         orr(dst, size, src, src);
2431       }
2432       break;
2433     case T_SHORT:
2434       rev16(dst, size, src);
2435       break;
2436     case T_INT:
2437       rev32(dst, size, src);
2438       break;
2439     case T_LONG:
2440       rev64(dst, size, src);
2441       break;
2442     default:
2443       assert(false, "unsupported");
2444       ShouldNotReachHere();
2445   }
2446 }
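
     // E.g. reversing the bytes within each int lane of a 128-bit vector
     // (hypothetical registers), neon_reverse_bytes(v0, v1, T_INT, true) is
     // expected to emit a single
     //   rev32 v0.16b, v1.16b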
2447 
2448 // Extract a scalar element from an SVE vector at position 'idx'.
2449 // The input elements in src are expected to be of integral type.
2450 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2451                                              int idx, FloatRegister vtmp) {
2452   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2453   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2454   if (regVariant_to_elemBits(size) * idx < 128) { // generate a lower-cost NEON instruction
2455     if (bt == T_INT || bt == T_LONG) {
2456       umov(dst, src, size, idx);
2457     } else {
2458       smov(dst, src, size, idx);
2459     }
2460   } else {
2461     sve_orr(vtmp, src, src);
2462     sve_ext(vtmp, vtmp, idx << size);
2463     if (bt == T_INT || bt == T_LONG) {
2464       umov(dst, vtmp, size, 0);
2465     } else {
2466       smov(dst, vtmp, size, 0);
2467     }
2468   }
2469 }
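
     // E.g. extracting int lane 5 (hypothetical registers): 32 * 5 = 160 >= 128,
     // so the SVE path copies src, shifts it down by 5 * 4 = 20 bytes with EXT,
     // and reads lane 0:
     //   orr  z2.d, z1.d, z1.d
     //   ext  z2.b, z2.b, z2.b, #20
     //   umov w0, v2.s[0]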
2470 
2471 // java.lang.Math::round intrinsics
2472 
2473 // Clobbers: rscratch1, rflags
2474 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2475                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2476   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2477   switch (T) {
2478     case T2S:
2479     case T4S:
2480       fmovs(tmp1, T, 0.5f);
2481       mov(rscratch1, jint_cast(0x1.0p23f));
2482       break;
2483     case T2D:
2484       fmovd(tmp1, T, 0.5);
2485       mov(rscratch1, julong_cast(0x1.0p52));
2486       break;
2487     default:
2488       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2489   }
2490   fadd(tmp1, T, tmp1, src);
2491   fcvtms(tmp1, T, tmp1);
2492   // tmp1 = floor(src + 0.5, ties to even)
2493 
2494   fcvtas(dst, T, src);
2495   // dst = round(src), ties to away
2496 
2497   fneg(tmp3, T, src);
2498   dup(tmp2, T, rscratch1);
2499   cm(HS, tmp3, T, tmp3, tmp2);
2500   // tmp3 is now a set of flags
2501 
2502   bif(dst, T16B, tmp1, tmp3);
2503   // result in dst
2504 }
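
     // A worked case, relying on Math.round(x) == floor(x + 0.5): for a lane
     // holding -0.5f, fcvtas gives -1 (ties away from zero) but floor(-0.5f + 0.5f)
     // == 0 is required, so the floor result is selected for that lane. The
     // comparison against 0x1.0p23f selects the floor result only for negative
     // lanes of small magnitude; positive lanes (where ties-away equals ties-up)
     // and NaNs keep the fcvtas result.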
2505 
2506 // Clobbers: rscratch1, rflags
2507 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2508                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2509   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2510   assert_different_registers(tmp1, tmp2, src, dst);
2511 
2512   switch (T) {
2513     case S:
2514       mov(rscratch1, jint_cast(0x1.0p23f));
2515       break;
2516     case D:
2517       mov(rscratch1, julong_cast(0x1.0p52));
2518       break;
2519     default:
2520       assert(T == S || T == D, "invalid register variant");
2521   }
2522 
2523   sve_frinta(dst, T, ptrue, src);
2524   // dst = round(src), ties to away
2525 
2526   Label none;
2527 
2528   sve_fneg(tmp1, T, ptrue, src);
2529   sve_dup(tmp2, T, rscratch1);
2530   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2531   br(EQ, none);
2532   {
2533     sve_cpy(tmp1, T, pgtmp, 0.5);
2534     sve_fadd(tmp1, T, pgtmp, src);
2535     sve_frintm(dst, T, pgtmp, tmp1);
2536     // dst = floor(src + 0.5, ties to even)
2537   }
2538   bind(none);
2539 
2540   sve_fcvtzs(dst, T, ptrue, dst, T);
2541   // result in dst
2542 }
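
     // As in the NEON variant above, only lanes holding negative values of small
     // magnitude (where -src compares at or below the 0x1.0p23 pattern as
     // unsigned) are recomputed as floor(src + 0.5); if no active lane qualifies,
     // the fix-up is skipped via the branch on the "none" condition.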
2543 
2544 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2545                                            FloatRegister one, SIMD_Arrangement T) {
2546   assert_different_registers(dst, src, zero, one);
2547   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2548 
2549   facgt(dst, T, src, zero);
2550   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2551   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2552 }
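
     // E.g. for a lane holding -3.5f: facgt sets the lane to all-ones, ushr turns
     // that into 0x7FFFFFFF, and bsl then takes the magnitude bits from `one`
     // (1.0f) and the sign bit from src, giving -1.0f; for +-0.0 and NaN the mask
     // is zero and the src value passes through unchanged.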
2553 
2554 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2555                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2556     assert_different_registers(dst, src, zero, one, vtmp);
2557     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2558 
2559     sve_orr(vtmp, src, src);
2560     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN, 0x1 otherwise
2561     switch (T) {
2562     case S:
2563       sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
2564       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2565                                         // on the sign of the float value
2566       break;
2567     case D:
2568       sve_and(vtmp, T, min_jlong);
2569       sve_orr(vtmp, T, jlong_cast(1.0));
2570       break;
2571     default:
2572       assert(false, "unsupported");
2573       ShouldNotReachHere();
2574     }
2575     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2576                                        // Result in dst
2577 }
2578 
2579 bool C2_MacroAssembler::in_scratch_emit_size() {
2580   if (ciEnv::current()->task() != nullptr) {
2581     PhaseOutput* phase_output = Compile::current()->output();
2582     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2583       return true;
2584     }
2585   }
2586   return MacroAssembler::in_scratch_emit_size();
2587 }