1 /*
   2  * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/assembler.hpp"
  29 #include "asm/assembler.inline.hpp"
  30 #include "code/compiledIC.hpp"
  31 #include "compiler/disassembler.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/cardTable.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "gc/shared/collectedHeap.hpp"
  37 #include "interpreter/bytecodeHistogram.hpp"
  38 #include "interpreter/interpreter.hpp"
  39 #include "memory/resourceArea.hpp"
  40 #include "memory/universe.hpp"
  41 #include "nativeInst_riscv.hpp"
  42 #include "oops/accessDecorators.hpp"
  43 #include "oops/compressedKlass.inline.hpp"
  44 #include "oops/compressedOops.inline.hpp"
  45 #include "oops/klass.inline.hpp"
  46 #include "oops/oop.hpp"
  47 #include "runtime/interfaceSupport.inline.hpp"
  48 #include "runtime/javaThread.hpp"
  49 #include "runtime/jniHandles.inline.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubRoutines.hpp"
  52 #include "utilities/globalDefinitions.hpp"
  53 #include "utilities/powerOfTwo.hpp"
  54 #ifdef COMPILER2
  55 #include "opto/compile.hpp"
  56 #include "opto/node.hpp"
  57 #include "opto/output.hpp"
  58 #endif
  59 
  60 #ifdef PRODUCT
  61 #define BLOCK_COMMENT(str) /* nothing */
  62 #else
  63 #define BLOCK_COMMENT(str) block_comment(str)
  64 #endif
  65 #define STOP(str) stop(str);
  66 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  67 
  68 static void pass_arg0(MacroAssembler* masm, Register arg) {
  69   if (c_rarg0 != arg) {
  70     masm->mv(c_rarg0, arg);
  71   }
  72 }
  73 
  74 static void pass_arg1(MacroAssembler* masm, Register arg) {
  75   if (c_rarg1 != arg) {
  76     masm->mv(c_rarg1, arg);
  77   }
  78 }
  79 
  80 static void pass_arg2(MacroAssembler* masm, Register arg) {
  81   if (c_rarg2 != arg) {
  82     masm->mv(c_rarg2, arg);
  83   }
  84 }
  85 
  86 static void pass_arg3(MacroAssembler* masm, Register arg) {
  87   if (c_rarg3 != arg) {
  88     masm->mv(c_rarg3, arg);
  89   }
  90 }
  91 
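     // Maintain JavaThread::cont_fastpath as a stack watermark for continuations:
     // push_cont_fastpath raises it to the current sp when sp lies above the stored
     // value, and pop_cont_fastpath clears it once sp has reached or passed it.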
  92 void MacroAssembler::push_cont_fastpath(Register java_thread) {
  93   if (!Continuations::enabled()) return;
  94   Label done;
  95   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  96   bleu(sp, t0, done);
  97   sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
  98   bind(done);
  99 }
 100 
 101 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
 102   if (!Continuations::enabled()) return;
 103   Label done;
 104   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 105   bltu(sp, t0, done);
 106   sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 107   bind(done);
 108 }
 109 
 110 int MacroAssembler::align(int modulus, int extra_offset) {
 111   CompressibleRegion cr(this);
 112   intptr_t before = offset();
 113   while ((offset() + extra_offset) % modulus != 0) { nop(); }
 114   return (int)(offset() - before);
 115 }
 116 
 117 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 118   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 119 }
 120 
 121 // Implementation of call_VM versions
 122 
 123 void MacroAssembler::call_VM(Register oop_result,
 124                              address entry_point,
 125                              bool check_exceptions) {
 126   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 127 }
 128 
 129 void MacroAssembler::call_VM(Register oop_result,
 130                              address entry_point,
 131                              Register arg_1,
 132                              bool check_exceptions) {
 133   pass_arg1(this, arg_1);
 134   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 135 }
 136 
 137 void MacroAssembler::call_VM(Register oop_result,
 138                              address entry_point,
 139                              Register arg_1,
 140                              Register arg_2,
 141                              bool check_exceptions) {
 142   assert_different_registers(arg_1, c_rarg2);
 143   pass_arg2(this, arg_2);
 144   pass_arg1(this, arg_1);
 145   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 146 }
 147 
 148 void MacroAssembler::call_VM(Register oop_result,
 149                              address entry_point,
 150                              Register arg_1,
 151                              Register arg_2,
 152                              Register arg_3,
 153                              bool check_exceptions) {
 154   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 155   assert_different_registers(arg_2, c_rarg3);
 156   pass_arg3(this, arg_3);
 157 
 158   pass_arg2(this, arg_2);
 159 
 160   pass_arg1(this, arg_1);
 161   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 162 }
 163 
 164 void MacroAssembler::call_VM(Register oop_result,
 165                              Register last_java_sp,
 166                              address entry_point,
 167                              int number_of_arguments,
 168                              bool check_exceptions) {
 169   call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 170 }
 171 
 172 void MacroAssembler::call_VM(Register oop_result,
 173                              Register last_java_sp,
 174                              address entry_point,
 175                              Register arg_1,
 176                              bool check_exceptions) {
 177   pass_arg1(this, arg_1);
 178   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 179 }
 180 
 181 void MacroAssembler::call_VM(Register oop_result,
 182                              Register last_java_sp,
 183                              address entry_point,
 184                              Register arg_1,
 185                              Register arg_2,
 186                              bool check_exceptions) {
 187 
 188   assert_different_registers(arg_1, c_rarg2);
 189   pass_arg2(this, arg_2);
 190   pass_arg1(this, arg_1);
 191   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 192 }
 193 
 194 void MacroAssembler::call_VM(Register oop_result,
 195                              Register last_java_sp,
 196                              address entry_point,
 197                              Register arg_1,
 198                              Register arg_2,
 199                              Register arg_3,
 200                              bool check_exceptions) {
 201   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 202   assert_different_registers(arg_2, c_rarg3);
 203   pass_arg3(this, arg_3);
 204   pass_arg2(this, arg_2);
 205   pass_arg1(this, arg_1);
 206   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 207 }
 208 
 209 void MacroAssembler::post_call_nop() {
 210   if (!Continuations::enabled()) {
 211     return;
 212   }
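       // Emit a fixed-length, recognizable sequence (nop; lui zr; addiw zr). Writes
       // to zr are discarded, so the immediate fields remain free to carry metadata
       // patched in later without changing the no-op behaviour.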
 213   relocate(post_call_nop_Relocation::spec(), [&] {
 214     InlineSkippedInstructionsCounter skipCounter(this);
 215     nop();
 216     li32(zr, 0);
 217   });
 218 }
 219 
 220 // these are no-ops overridden by InterpreterMacroAssembler
 221 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
 222 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 223 
 224 // Calls to C land
 225 //
 226 // When entering C land, the fp & esp of the last Java frame have to be recorded
 227 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 228 // has to be reset to 0. This is required to allow proper stack traversal.
 229 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 230                                          Register last_java_fp,
 231                                          Register last_java_pc,
 232                                          Register tmp) {
 233 
 234   if (last_java_pc->is_valid()) {
 235       sd(last_java_pc, Address(xthread,
 236                                JavaThread::frame_anchor_offset() +
 237                                JavaFrameAnchor::last_Java_pc_offset()));
 238   }
 239 
 240   // determine last_java_sp register
 241   if (last_java_sp == sp) {
 242     mv(tmp, sp);
 243     last_java_sp = tmp;
 244   } else if (!last_java_sp->is_valid()) {
 245     last_java_sp = esp;
 246   }
 247 
 248   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 249 
 250   // last_java_fp is optional
 251   if (last_java_fp->is_valid()) {
 252     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 253   }
 254 }
 255 
 256 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 257                                          Register last_java_fp,
 258                                          address  last_java_pc,
 259                                          Register tmp) {
 260   assert(last_java_pc != nullptr, "must provide a valid PC");
 261 
 262   la(tmp, last_java_pc);
 263   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 264 
 265   set_last_Java_frame(last_java_sp, last_java_fp, noreg, tmp);
 266 }
 267 
 268 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 269                                          Register last_java_fp,
 270                                          Label &L,
 271                                          Register tmp) {
 272   if (L.is_bound()) {
 273     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 274   } else {
 275     L.add_patch_at(code(), locator());
 276     IncompressibleRegion ir(this);  // the label address will be patched back.
 277     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 278   }
 279 }
 280 
 281 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 282   // we must set sp to zero to clear frame
 283   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 284 
 285   // must clear fp, so that compiled frames are not confused; it is
 286   // possible that we need it only for debugging
 287   if (clear_fp) {
 288     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 289   }
 290 
 291   // Always clear the pc because it could have been set by make_walkable()
 292   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 293 }
 294 
 295 void MacroAssembler::call_VM_base(Register oop_result,
 296                                   Register java_thread,
 297                                   Register last_java_sp,
 298                                   address  entry_point,
 299                                   int      number_of_arguments,
 300                                   bool     check_exceptions) {
 301    // determine java_thread register
 302   if (!java_thread->is_valid()) {
 303     java_thread = xthread;
 304   }
 305   // determine last_java_sp register
 306   if (!last_java_sp->is_valid()) {
 307     last_java_sp = esp;
 308   }
 309 
 310   // debugging support
 311   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 312   assert(java_thread == xthread, "unexpected register");
 313 
 314   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 315   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 316 
 317   // push java thread (becomes first argument of C function)
 318   mv(c_rarg0, java_thread);
 319 
 320   // set last Java frame before call
 321   assert(last_java_sp != fp, "can't use fp");
 322 
 323   Label l;
 324   set_last_Java_frame(last_java_sp, fp, l, t0);
 325 
 326   // do the call, remove parameters
 327   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 328 
 329   // reset last Java frame
 330   // Only interpreter should have to clear fp
 331   reset_last_Java_frame(true);
 332 
 333    // C++ interp handles this in the interpreter
 334   check_and_handle_popframe(java_thread);
 335   check_and_handle_earlyret(java_thread);
 336 
 337   if (check_exceptions) {
 338     // check for pending exceptions (java_thread is set upon return)
 339     ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 340     Label ok;
 341     beqz(t0, ok);
 342     RuntimeAddress target(StubRoutines::forward_exception_entry());
 343     relocate(target.rspec(), [&] {
 344       int32_t offset;
 345       la(t0, target.target(), offset);
 346       jalr(x0, t0, offset);
 347     });
 348     bind(ok);
 349   }
 350 
 351   // get oop result if there is one and reset the value in the thread
 352   if (oop_result->is_valid()) {
 353     get_vm_result(oop_result, java_thread);
 354   }
 355 }
 356 
 357 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 358   ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 359   sd(zr, Address(java_thread, JavaThread::vm_result_offset()));
 360   verify_oop_msg(oop_result, "broken oop in call_VM_base");
 361 }
 362 
 363 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 364   ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 365   sd(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 366 }
 367 
 368 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 369   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
 370   assert_different_registers(klass, xthread, tmp);
 371 
 372   Label L_fallthrough, L_tmp;
 373   if (L_fast_path == nullptr) {
 374     L_fast_path = &L_fallthrough;
 375   } else if (L_slow_path == nullptr) {
 376     L_slow_path = &L_fallthrough;
 377   }
 378 
 379   // Fast path check: class is fully initialized
 380   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 381   sub(tmp, tmp, InstanceKlass::fully_initialized);
 382   beqz(tmp, *L_fast_path);
 383 
 384   // Fast path check: current thread is initializer thread
 385   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 386 
 387   if (L_slow_path == &L_fallthrough) {
 388     beq(xthread, tmp, *L_fast_path);
 389     bind(*L_slow_path);
 390   } else if (L_fast_path == &L_fallthrough) {
 391     bne(xthread, tmp, *L_slow_path);
 392     bind(*L_fast_path);
 393   } else {
 394     Unimplemented();
 395   }
 396 }
 397 
 398 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
 399   if (!VerifyOops) { return; }
 400 
 401   // Pass register number to verify_oop_subroutine
 402   const char* b = nullptr;
 403   {
 404     ResourceMark rm;
 405     stringStream ss;
 406     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
 407     b = code_string(ss.as_string());
 408   }
 409   BLOCK_COMMENT("verify_oop {");
 410 
 411   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 412 
 413   mv(c_rarg0, reg); // c_rarg0 : x10
 414   {
 415     // The length of the instruction sequence emitted should not depend
 416     // on the address of the char buffer so that the size of mach nodes for
 417     // scratch emit and normal emit matches.
 418     IncompressibleRegion ir(this);  // Fixed length
 419     movptr(t0, (address) b);
 420   }
 421 
 422   // call indirectly to solve generation ordering problem
 423   ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 424   relocate(target.rspec(), [&] {
 425     int32_t offset;
 426     la(t1, target.target(), offset);
 427     ld(t1, Address(t1, offset));
 428   });
 429   jalr(t1);
 430 
 431   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 432 
 433   BLOCK_COMMENT("} verify_oop");
 434 }
 435 
 436 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
 437   if (!VerifyOops) {
 438     return;
 439   }
 440 
 441   const char* b = nullptr;
 442   {
 443     ResourceMark rm;
 444     stringStream ss;
 445     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
 446     b = code_string(ss.as_string());
 447   }
 448   BLOCK_COMMENT("verify_oop_addr {");
 449 
 450   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 451 
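       // The push above moved sp down by 4 words, so an sp-relative address must
       // be rebased by that amount before the load.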
 452   if (addr.uses(sp)) {
 453     la(x10, addr);
 454     ld(x10, Address(x10, 4 * wordSize));
 455   } else {
 456     ld(x10, addr);
 457   }
 458 
 459   {
 460     // The length of the instruction sequence emitted should not depend
 461     // on the address of the char buffer so that the size of mach nodes for
 462     // scratch emit and normal emit matches.
 463     IncompressibleRegion ir(this);  // Fixed length
 464     movptr(t0, (address) b);
 465   }
 466 
 467   // call indirectly to solve generation ordering problem
 468   ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 469   relocate(target.rspec(), [&] {
 470     int32_t offset;
 471     la(t1, target.target(), offset);
 472     ld(t1, Address(t1, offset));
 473   });
 474   jalr(t1);
 475 
 476   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 477 
 478   BLOCK_COMMENT("} verify_oop_addr");
 479 }
 480 
 481 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 482                                          int extra_slot_offset) {
 483   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 484   int stackElementSize = Interpreter::stackElementSize;
 485   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 486 #ifdef ASSERT
 487   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 488   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 489 #endif
 490   if (arg_slot.is_constant()) {
 491     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 492   } else {
 493     assert_different_registers(t0, arg_slot.as_register());
 494     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 495     return Address(t0, offset);
 496   }
 497 }
 498 
 499 #ifndef PRODUCT
 500 extern "C" void findpc(intptr_t x);
 501 #endif
 502 
 503 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 504 {
 505   // In order to get locks to work, we need to fake an in_VM state
 506   if (ShowMessageBoxOnError) {
 507     JavaThread* thread = JavaThread::current();
 508     JavaThreadState saved_state = thread->thread_state();
 509     thread->set_thread_state(_thread_in_vm);
 510 #ifndef PRODUCT
 511     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 512       ttyLocker ttyl;
 513       BytecodeCounter::print();
 514     }
 515 #endif
 516     if (os::message_box(msg, "Execution stopped, print registers?")) {
 517       ttyLocker ttyl;
 518       tty->print_cr(" pc = 0x%016lx", pc);
 519 #ifndef PRODUCT
 520       tty->cr();
 521       findpc(pc);
 522       tty->cr();
 523 #endif
 524       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 525       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 526       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 527       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 528       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 529       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 530       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 531       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 532       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 533       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 534       tty->print_cr("x10 = 0x%016lx", regs[10]);
 535       tty->print_cr("x11 = 0x%016lx", regs[11]);
 536       tty->print_cr("x12 = 0x%016lx", regs[12]);
 537       tty->print_cr("x13 = 0x%016lx", regs[13]);
 538       tty->print_cr("x14 = 0x%016lx", regs[14]);
 539       tty->print_cr("x15 = 0x%016lx", regs[15]);
 540       tty->print_cr("x16 = 0x%016lx", regs[16]);
 541       tty->print_cr("x17 = 0x%016lx", regs[17]);
 542       tty->print_cr("x18 = 0x%016lx", regs[18]);
 543       tty->print_cr("x19 = 0x%016lx", regs[19]);
 544       tty->print_cr("x20 = 0x%016lx", regs[20]);
 545       tty->print_cr("x21 = 0x%016lx", regs[21]);
 546       tty->print_cr("x22 = 0x%016lx", regs[22]);
 547       tty->print_cr("x23 = 0x%016lx", regs[23]);
 548       tty->print_cr("x24 = 0x%016lx", regs[24]);
 549       tty->print_cr("x25 = 0x%016lx", regs[25]);
 550       tty->print_cr("x26 = 0x%016lx", regs[26]);
 551       tty->print_cr("x27 = 0x%016lx", regs[27]);
 552       tty->print_cr("x28 = 0x%016lx", regs[28]);
           tty->print_cr("x29 = 0x%016lx", regs[29]);
 553       tty->print_cr("x30 = 0x%016lx", regs[30]);
 554       tty->print_cr("x31 = 0x%016lx", regs[31]);
 555       BREAKPOINT;
 556     }
 557   }
 558   fatal("DEBUG MESSAGE: %s", msg);
 559 }
 560 
 561 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
 562   assert_different_registers(value, tmp1, tmp2);
 563   Label done, tagged, weak_tagged;
 564 
 565   beqz(value, done);           // Use null as-is.
 566   // Test for tag.
 567   andi(tmp1, value, JNIHandles::tag_mask);
 568   bnez(tmp1, tagged);
 569 
 570   // Resolve local handle
 571   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
 572   verify_oop(value);
 573   j(done);
 574 
 575   bind(tagged);
 576   // Test for jweak tag.
 577   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
 578   test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
 579   bnez(tmp1, weak_tagged);
 580 
 581   // Resolve global handle
 582   access_load_at(T_OBJECT, IN_NATIVE, value,
 583                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 584   verify_oop(value);
 585   j(done);
 586 
 587   bind(weak_tagged);
 588   // Resolve jweak.
 589   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
 590                  Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
 591   verify_oop(value);
 592 
 593   bind(done);
 594 }
 595 
 596 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
 597   assert_different_registers(value, tmp1, tmp2);
 598   Label done;
 599 
 600   beqz(value, done);           // Use null as-is.
 601 
 602 #ifdef ASSERT
 603   {
 604     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
 605     Label valid_global_tag;
 606     test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
 607     bnez(tmp1, valid_global_tag);
 608     stop("non global jobject using resolve_global_jobject");
 609     bind(valid_global_tag);
 610   }
 611 #endif
 612 
 613   // Resolve global handle
 614   access_load_at(T_OBJECT, IN_NATIVE, value,
 615                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 616   verify_oop(value);
 617 
 618   bind(done);
 619 }
 620 
 621 void MacroAssembler::stop(const char* msg) {
 622   BLOCK_COMMENT(msg);
 623   illegal_instruction(Assembler::csr::time);
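       // Embed the 64-bit message pointer right after the trapping instruction so
       // the error handler can recover it.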
 624   emit_int64((uintptr_t)msg);
 625 }
 626 
 627 void MacroAssembler::unimplemented(const char* what) {
 628   const char* buf = nullptr;
 629   {
 630     ResourceMark rm;
 631     stringStream ss;
 632     ss.print("unimplemented: %s", what);
 633     buf = code_string(ss.as_string());
 634   }
 635   stop(buf);
 636 }
 637 
 638 void MacroAssembler::emit_static_call_stub() {
 639   IncompressibleRegion ir(this);  // Fixed length: see CompiledDirectCall::to_interp_stub_size().
 640   // CompiledDirectCall::set_to_interpreted knows the
 641   // exact layout of this stub.
 642 
 643   mov_metadata(xmethod, (Metadata*)nullptr);
 644 
 645   // Jump to the entry point of the c2i stub.
 646   int32_t offset = 0;
 647   movptr(t0, 0, offset);
 648   jalr(x0, t0, offset);
 649 }
 650 
 651 void MacroAssembler::call_VM_leaf_base(address entry_point,
 652                                        int number_of_arguments,
 653                                        Label *retaddr) {
 654   push_reg(RegSet::of(t0, xmethod), sp);   // push << t0 & xmethod >> to sp
 655   call(entry_point);
 656   if (retaddr != nullptr) {
 657     bind(*retaddr);
 658   }
 659   pop_reg(RegSet::of(t0, xmethod), sp);   // pop << t0 & xmethod >> from sp
 660 }
 661 
 662 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
 663   call_VM_leaf_base(entry_point, number_of_arguments);
 664 }
 665 
 666 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
 667   pass_arg0(this, arg_0);
 668   call_VM_leaf_base(entry_point, 1);
 669 }
 670 
 671 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 672   assert_different_registers(arg_1, c_rarg0);
 673   pass_arg0(this, arg_0);
 674   pass_arg1(this, arg_1);
 675   call_VM_leaf_base(entry_point, 2);
 676 }
 677 
 678 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
 679                                   Register arg_1, Register arg_2) {
 680   assert_different_registers(arg_1, c_rarg0);
 681   assert_different_registers(arg_2, c_rarg0, c_rarg1);
 682   pass_arg0(this, arg_0);
 683   pass_arg1(this, arg_1);
 684   pass_arg2(this, arg_2);
 685   call_VM_leaf_base(entry_point, 3);
 686 }
 687 
 688 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
 689   pass_arg0(this, arg_0);
 690   MacroAssembler::call_VM_leaf_base(entry_point, 1);
 691 }
 692 
 693 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 694 
 695   assert_different_registers(arg_0, c_rarg1);
 696   pass_arg1(this, arg_1);
 697   pass_arg0(this, arg_0);
 698   MacroAssembler::call_VM_leaf_base(entry_point, 2);
 699 }
 700 
 701 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 702   assert_different_registers(arg_0, c_rarg1, c_rarg2);
 703   assert_different_registers(arg_1, c_rarg2);
 704   pass_arg2(this, arg_2);
 705   pass_arg1(this, arg_1);
 706   pass_arg0(this, arg_0);
 707   MacroAssembler::call_VM_leaf_base(entry_point, 3);
 708 }
 709 
 710 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
 711   assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
 712   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 713   assert_different_registers(arg_2, c_rarg3);
 714 
 715   pass_arg3(this, arg_3);
 716   pass_arg2(this, arg_2);
 717   pass_arg1(this, arg_1);
 718   pass_arg0(this, arg_0);
 719   MacroAssembler::call_VM_leaf_base(entry_point, 4);
 720 }
 721 
 722 void MacroAssembler::la(Register Rd, const address addr) {
 723   int64_t offset = addr - pc();
 724   if (is_simm32(offset)) {
 725     auipc(Rd, (int32_t)offset + 0x800);  // add 0x800 so the sign-extended low 12 bits of the following addi land on the target
 726     addi(Rd, Rd, ((int64_t)offset << 52) >> 52);
 727   } else {
 728     movptr(Rd, addr);
 729   }
 730 }
 731 
 732 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
 733   assert((uintptr_t)addr < (1ull << 48), "bad address");
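       // Materialize 'addr' into Rd up to a low 12-bit displacement, which is
       // returned in 'offset' for the caller to fold into its following
       // load/store/jalr instruction.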
 734 
 735   unsigned long target_address = (uintptr_t)addr;
 736   unsigned long low_address = (uintptr_t)CodeCache::low_bound();
 737   unsigned long high_address = (uintptr_t)CodeCache::high_bound();
 738   long offset_low = target_address - low_address;
 739   long offset_high = target_address - high_address;
 740 
 741   // RISC-V doesn't compute a page-aligned address, in order to partially
 742   // compensate for the use of *signed* offsets in its base+disp12
 743   // addressing mode (RISC-V's PC-relative reach remains asymmetric:
 744   // [-(2G + 2K), 2G - 2K)).
 745   if (offset_high >= -((1L << 31) + (1L << 11)) && offset_low < (1L << 31) - (1L << 11)) {
 746     int64_t distance = addr - pc();
 747     auipc(Rd, (int32_t)distance + 0x800);
 748     offset = ((int32_t)distance << 20) >> 20;
 749   } else {
 750     movptr(Rd, addr, offset);
 751   }
 752 }
 753 
 754 void MacroAssembler::la(Register Rd, const Address &adr) {
 755   switch (adr.getMode()) {
 756     case Address::literal: {
 757       relocInfo::relocType rtype = adr.rspec().reloc()->type();
 758       if (rtype == relocInfo::none) {
 759         mv(Rd, (intptr_t)(adr.target()));
 760       } else {
 761         relocate(adr.rspec(), [&] {
 762           movptr(Rd, adr.target());
 763         });
 764       }
 765       break;
 766     }
 767     case Address::base_plus_offset: {
 768       Address new_adr = legitimize_address(Rd, adr);
 769       if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
 770         addi(Rd, new_adr.base(), new_adr.offset());
 771       }
 772       break;
 773     }
 774     default:
 775       ShouldNotReachHere();
 776   }
 777 }
 778 
 779 void MacroAssembler::la(Register Rd, Label &label) {
 780   IncompressibleRegion ir(this);   // the label address may be patched back.
 781   wrap_label(Rd, label, &MacroAssembler::la);
 782 }
 783 
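     // Zero-extend a 16-bit immediate into Rd: lui places imm in bits [27:12],
     // srli shifts it back down and leaves the upper bits clear.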
 784 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
 785   lui(Rd, (uint32_t)imm << 12);
 786   srli(Rd, Rd, 12);
 787 }
 788 
 789 void MacroAssembler::li32(Register Rd, int32_t imm) {
 790   // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
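       // A small worked example: li32(Rd, 0x12345fff) emits roughly
       //   lui   Rd, 0x12346
       //   addiw Rd, Rd, -1
       // since the low 12 bits sign-extend to -1 and the borrow is folded into
       // the lui immediate.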
 791   int64_t upper = imm, lower = imm;
 792   lower = (imm << 20) >> 20;
 793   upper -= lower;
 794   upper = (int32_t)upper;
 795   // lui Rd, imm[31:12] + imm[11]
 796   lui(Rd, upper);
 797   // use addiw to distinguish li32 from li64
 798   addiw(Rd, Rd, lower);
 799 }
 800 
 801 void MacroAssembler::li64(Register Rd, int64_t imm) {
 802   // Load upper 32 bits. upper = imm[63:32], but if imm[31] == 1 or
 803   // (imm[31:20] == 0x7ff && imm[19] == 1), upper = imm[63:32] + 1.
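       // The emitted sequence has a fixed length of eight instructions: lui + addi
       // build the upper 32 bits, then slli/addi pairs (shifts of 12, 12 and 8)
       // append the remaining bits.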
 804   int64_t lower = imm & 0xffffffff;
 805   lower -= ((lower << 44) >> 44);
 806   int64_t tmp_imm = ((uint64_t)(imm & 0xffffffff00000000)) + (uint64_t)lower;
 807   int32_t upper = (tmp_imm - (int32_t)lower) >> 32;
 808 
 809   // Load upper 32 bits
 810   int64_t up = upper, lo = upper;
 811   lo = (lo << 52) >> 52;
 812   up -= lo;
 813   up = (int32_t)up;
 814   lui(Rd, up);
 815   addi(Rd, Rd, lo);
 816 
 817   // Load the remaining 32 bits.
 818   slli(Rd, Rd, 12);
 819   addi(Rd, Rd, (int32_t)lower >> 20);
 820   slli(Rd, Rd, 12);
 821   lower = ((int32_t)imm << 12) >> 20;
 822   addi(Rd, Rd, lower);
 823   slli(Rd, Rd, 8);
 824   lower = imm & 0xff;
 825   addi(Rd, Rd, lower);
 826 }
 827 
 828 void MacroAssembler::li(Register Rd, int64_t imm) {
 829   // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
 830   // li -> c.li
 831   if (do_compress() && (is_simm6(imm) && Rd != x0)) {
 832     c_li(Rd, imm);
 833     return;
 834   }
 835 
 836   int shift = 12;
 837   int64_t upper = imm, lower = imm;
 838   // Split imm to a lower 12-bit sign-extended part and the remainder,
 839   // because addi will sign-extend the lower imm.
 840   lower = ((int32_t)imm << 20) >> 20;
 841   upper -= lower;
 842 
 843   // Test whether imm is a 32-bit integer.
 844   if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
 845         (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
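         // imm needs more than 32 bits: strip trailing zero bits from the upper
         // part, materialize it recursively, shift it back into place and add the
         // sign-extended low 12 bits.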
 846     while (((upper >> shift) & 1) == 0) { shift++; }
 847     upper >>= shift;
 848     li(Rd, upper);
 849     slli(Rd, Rd, shift);
 850     if (lower != 0) {
 851       addi(Rd, Rd, lower);
 852     }
 853   } else {
 854     // 32-bit integer
 855     Register hi_Rd = zr;
 856     if (upper != 0) {
 857       lui(Rd, (int32_t)upper);
 858       hi_Rd = Rd;
 859     }
 860     if (lower != 0 || hi_Rd == zr) {
 861       addiw(Rd, hi_Rd, lower);
 862     }
 863   }
 864 }
 865 
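     // Jump (j) or call (jal) to a fixed address: a single jal suffices when the
     // pc-relative distance fits in a signed 21-bit even offset; otherwise the
     // address is materialized into 'temp' and reached via jalr.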
 866 #define INSN(NAME, REGISTER)                                       \
 867   void MacroAssembler::NAME(const address dest, Register temp) {   \
 868     assert_cond(dest != nullptr);                                  \
 869     int64_t distance = dest - pc();                                \
 870     if (is_simm21(distance) && ((distance % 2) == 0)) {            \
 871       Assembler::jal(REGISTER, distance);                          \
 872     } else {                                                       \
 873       assert(temp != noreg, "expecting a register");               \
 874       int32_t offset = 0;                                          \
 875       movptr(temp, dest, offset);                                  \
 876       Assembler::jalr(REGISTER, temp, offset);                     \
 877     }                                                              \
 878   }                                                                \
 879 
 880   INSN(j,   x0);
 881   INSN(jal, x1);
 882 
 883 #undef INSN
 884 
 885 #define INSN(NAME, REGISTER)                                       \
 886   void MacroAssembler::NAME(const Address &adr, Register temp) {   \
 887     switch (adr.getMode()) {                                       \
 888       case Address::literal: {                                     \
 889         relocate(adr.rspec(), [&] {                                \
 890           NAME(adr.target(), temp);                                \
 891         });                                                        \
 892         break;                                                     \
 893       }                                                            \
 894       case Address::base_plus_offset: {                            \
 895         int32_t offset = ((int32_t)adr.offset() << 20) >> 20;      \
 896         la(temp, Address(adr.base(), adr.offset() - offset));      \
 897         Assembler::jalr(REGISTER, temp, offset);                   \
 898         break;                                                     \
 899       }                                                            \
 900       default:                                                     \
 901         ShouldNotReachHere();                                      \
 902     }                                                              \
 903   }
 904 
 905   INSN(j,   x0);
 906   INSN(jal, x1);
 907 
 908 #undef INSN
 909 
 910 #define INSN(NAME)                                                                    \
 911   void MacroAssembler::NAME(Register Rd, const address dest, Register temp) {         \
 912     assert_cond(dest != nullptr);                                                     \
 913     int64_t distance = dest - pc();                                                   \
 914     if (is_simm21(distance) && ((distance % 2) == 0)) {                               \
 915       Assembler::NAME(Rd, distance);                                                  \
 916     } else {                                                                          \
 917       assert_different_registers(Rd, temp);                                           \
 918       int32_t offset = 0;                                                             \
 919       movptr(temp, dest, offset);                                                     \
 920       jalr(Rd, temp, offset);                                                         \
 921     }                                                                                 \
 922   }                                                                                   \
 923   void MacroAssembler::NAME(Register Rd, Label &L, Register temp) {                   \
 924     assert_different_registers(Rd, temp);                                             \
 925     wrap_label(Rd, L, temp, &MacroAssembler::NAME);                                   \
 926   }
 927 
 928   INSN(jal);
 929 
 930 #undef INSN
 931 
 932 #define INSN(NAME, REGISTER)                                       \
 933   void MacroAssembler::NAME(Label &l, Register temp) {             \
 934     jal(REGISTER, l, temp);                                        \
 935   }                                                                \
 936 
 937   INSN(j,   x0);
 938   INSN(jal, x1);
 939 
 940 #undef INSN
 941 
 942 void MacroAssembler::wrap_label(Register Rt, Label &L, Register tmp, load_insn_by_temp insn) {
 943   if (L.is_bound()) {
 944     (this->*insn)(Rt, target(L), tmp);
 945   } else {
 946     L.add_patch_at(code(), locator());
 947     (this->*insn)(Rt, pc(), tmp);
 948   }
 949 }
 950 
 951 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
 952   if (L.is_bound()) {
 953     (this->*insn)(Rt, target(L));
 954   } else {
 955     L.add_patch_at(code(), locator());
 956     (this->*insn)(Rt, pc());
 957   }
 958 }
 959 
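     // Compare-and-branch to a label. A far target is out of reach of the 13-bit
     // conditional branch offset, so emit the negated condition branching over an
     // unconditional j to the label instead.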
 960 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
 961                                 compare_and_branch_insn insn,
 962                                 compare_and_branch_label_insn neg_insn, bool is_far) {
 963   if (is_far) {
 964     Label done;
 965     (this->*neg_insn)(r1, r2, done, /* is_far */ false);
 966     j(L);
 967     bind(done);
 968   } else {
 969     if (L.is_bound()) {
 970       (this->*insn)(r1, r2, target(L));
 971     } else {
 972       L.add_patch_at(code(), locator());
 973       (this->*insn)(r1, r2, pc());
 974     }
 975   }
 976 }
 977 
 978 #define INSN(NAME, NEG_INSN)                                                              \
 979   void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
 980     wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
 981   }
 982 
 983   INSN(beq,  bne);
 984   INSN(bne,  beq);
 985   INSN(blt,  bge);
 986   INSN(bge,  blt);
 987   INSN(bltu, bgeu);
 988   INSN(bgeu, bltu);
 989 
 990 #undef INSN
 991 
 992 #define INSN(NAME)                                                                \
 993   void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
 994     NAME(Rs, zr, dest);                                                           \
 995   }                                                                               \
 996   void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
 997     NAME(Rs, zr, l, is_far);                                                      \
 998   }                                                                               \
 999 
1000   INSN(beq);
1001   INSN(bne);
1002   INSN(blt);
1003   INSN(ble);
1004   INSN(bge);
1005   INSN(bgt);
1006 
1007 #undef INSN
1008 
1009 #define INSN(NAME, NEG_INSN)                                                      \
1010   void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
1011     NEG_INSN(Rt, Rs, dest);                                                       \
1012   }                                                                               \
1013   void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
1014     NEG_INSN(Rt, Rs, l, is_far);                                                  \
1015   }
1016 
1017   INSN(bgt,  blt);
1018   INSN(ble,  bge);
1019   INSN(bgtu, bltu);
1020   INSN(bleu, bgeu);
1021 
1022 #undef INSN
1023 
1024 // Float compare branch instructions
1025 
1026 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1027   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1028     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1029     BRANCH(t0, l, is_far);                                                                                              \
1030   }                                                                                                                     \
1031   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1032     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1033     BRANCH(t0, l, is_far);                                                                                              \
1034   }
1035 
1036   INSN(beq, feq, bnez);
1037   INSN(bne, feq, beqz);
1038 
1039 #undef INSN
1040 
1041 
1042 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1043   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1044                                     bool is_far, bool is_unordered) {                 \
1045     if (is_unordered) {                                                               \
1046       /* jump if either source is NaN or condition is expected */                     \
1047       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1048       beqz(t0, l, is_far);                                                            \
1049     } else {                                                                          \
1050       /* jump if no NaN in source and condition is expected */                        \
1051       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1052       bnez(t0, l, is_far);                                                            \
1053     }                                                                                 \
1054   }                                                                                   \
1055   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1056                                      bool is_far, bool is_unordered) {                \
1057     if (is_unordered) {                                                               \
1058       /* jump if either source is NaN or condition is expected */                     \
1059       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1060       beqz(t0, l, is_far);                                                            \
1061     } else {                                                                          \
1062       /* jump if no NaN in source and condition is expected */                        \
1063       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1064       bnez(t0, l, is_far);                                                            \
1065     }                                                                                 \
1066   }
1067 
1068   INSN(ble, fle, flt);
1069   INSN(blt, flt, fle);
1070 
1071 #undef INSN
1072 
1073 #define INSN(NAME, CMP)                                                              \
1074   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1075                                     bool is_far, bool is_unordered) {                \
1076     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1077   }                                                                                  \
1078   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1079                                      bool is_far, bool is_unordered) {               \
1080     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1081   }
1082 
1083   INSN(bgt, blt);
1084   INSN(bge, ble);
1085 
1086 #undef INSN
1087 
1088 
1089 #define INSN(NAME, CSR)                       \
1090   void MacroAssembler::NAME(Register Rd) {    \
1091     csrr(Rd, CSR);                            \
1092   }
1093 
1094   INSN(rdinstret,  CSR_INSTRET);
1095   INSN(rdcycle,    CSR_CYCLE);
1096   INSN(rdtime,     CSR_TIME);
1097   INSN(frcsr,      CSR_FCSR);
1098   INSN(frrm,       CSR_FRM);
1099   INSN(frflags,    CSR_FFLAGS);
1100 
1101 #undef INSN
1102 
1103 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1104   csrrs(Rd, csr, x0);
1105 }
1106 
1107 #define INSN(NAME, OPFUN)                                      \
1108   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
1109     OPFUN(x0, csr, Rs);                                        \
1110   }
1111 
1112   INSN(csrw, csrrw);
1113   INSN(csrs, csrrs);
1114   INSN(csrc, csrrc);
1115 
1116 #undef INSN
1117 
1118 #define INSN(NAME, OPFUN)                                      \
1119   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
1120     OPFUN(x0, csr, imm);                                       \
1121   }
1122 
1123   INSN(csrwi, csrrwi);
1124   INSN(csrsi, csrrsi);
1125   INSN(csrci, csrrci);
1126 
1127 #undef INSN
1128 
1129 #define INSN(NAME, CSR)                                      \
1130   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
1131     csrrw(Rd, CSR, Rs);                                      \
1132   }
1133 
1134   INSN(fscsr,   CSR_FCSR);
1135   INSN(fsrm,    CSR_FRM);
1136   INSN(fsflags, CSR_FFLAGS);
1137 
1138 #undef INSN
1139 
1140 #define INSN(NAME)                              \
1141   void MacroAssembler::NAME(Register Rs) {      \
1142     NAME(x0, Rs);                               \
1143   }
1144 
1145   INSN(fscsr);
1146   INSN(fsrm);
1147   INSN(fsflags);
1148 
1149 #undef INSN
1150 
1151 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1152   guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1153   csrrwi(Rd, CSR_FRM, imm);
1154 }
1155 
1156 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1157    csrrwi(Rd, CSR_FFLAGS, imm);
1158 }
1159 
1160 #define INSN(NAME)                             \
1161   void MacroAssembler::NAME(unsigned imm) {    \
1162     NAME(x0, imm);                             \
1163   }
1164 
1165   INSN(fsrmi);
1166   INSN(fsflagsi);
1167 
1168 #undef INSN
1169 
1170 void MacroAssembler::push_reg(Register Rs)
1171 {
1172   addi(esp, esp, 0 - wordSize);
1173   sd(Rs, Address(esp, 0));
1174 }
1175 
1176 void MacroAssembler::pop_reg(Register Rd)
1177 {
1178   ld(Rd, Address(esp, 0));
1179   addi(esp, esp, wordSize);
1180 }
1181 
1182 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1183   int count = 0;
1184   // Scan bitset to accumulate register numbers, highest register first
1185   for (int reg = 31; reg >= 0; reg--) {
1186     if ((1U << 31) & bitset) {
1187       regs[count++] = reg;
1188     }
1189     bitset <<= 1;
1190   }
1191   return count;
1192 }
1193 
1194 // Push integer registers in the bitset supplied. Don't push sp.
1195 // Return the number of words pushed
1196 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
1197   DEBUG_ONLY(int words_pushed = 0;)
1198   unsigned char regs[32];
1199   int count = bitset_to_regs(bitset, regs);
1200   // reserve one extra slot so the stack stays 16-byte aligned for an odd count
1201   int offset = is_even(count) ? 0 : wordSize;
1202 
1203   if (count) {
1204     addi(stack, stack, -count * wordSize - offset);
1205   }
1206   for (int i = count - 1; i >= 0; i--) {
1207     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1208     DEBUG_ONLY(words_pushed++;)
1209   }
1210 
1211   assert(words_pushed == count, "oops, pushed != count");
1212 
1213   return count;
1214 }
1215 
1216 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1217   DEBUG_ONLY(int words_popped = 0;)
1218   unsigned char regs[32];
1219   int count = bitset_to_regs(bitset, regs);
1220   // reserve one extra slot so the stack stays 16-byte aligned for an odd count
1221   int offset = is_even(count) ? 0 : wordSize;
1222 
1223   for (int i = count - 1; i >= 0; i--) {
1224     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1225     DEBUG_ONLY(words_popped++;)
1226   }
1227 
1228   if (count) {
1229     addi(stack, stack, count * wordSize + offset);
1230   }
1231   assert(words_popped == count, "oops, popped != count");
1232 
1233   return count;
1234 }
1235 
1236 // Push floating-point registers in the bitset supplied.
1237 // Return the number of words pushed
1238 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1239   DEBUG_ONLY(int words_pushed = 0;)
1240   unsigned char regs[32];
1241   int count = bitset_to_regs(bitset, regs);
1242   int push_slots = count + (count & 1);
1243 
1244   if (count) {
1245     addi(stack, stack, -push_slots * wordSize);
1246   }
1247 
1248   for (int i = count - 1; i >= 0; i--) {
1249     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1250     DEBUG_ONLY(words_pushed++;)
1251   }
1252 
1253   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1254 
1255   return count;
1256 }
1257 
1258 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1259   DEBUG_ONLY(int words_popped = 0;)
1260   unsigned char regs[32];
1261   int count = bitset_to_regs(bitset, regs);
1262   int pop_slots = count + (count & 1);
1263 
1264   for (int i = count - 1; i >= 0; i--) {
1265     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1266     DEBUG_ONLY(words_popped++;)
1267   }
1268 
1269   if (count) {
1270     addi(stack, stack, pop_slots * wordSize);
1271   }
1272 
1273   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1274 
1275   return count;
1276 }
1277 
1278 #ifdef COMPILER2
1279 // Push vector registers in the bitset supplied.
1280 // Return the number of words pushed
1281 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
1282   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1283 
1284   // Scan bitset to accumulate register numbers
1285   unsigned char regs[32];
1286   int count = bitset_to_regs(bitset, regs);
1287 
1288   for (int i = 0; i < count; i++) {
1289     sub(stack, stack, vector_size_in_bytes);
1290     vs1r_v(as_VectorRegister(regs[i]), stack);
1291   }
1292 
1293   return count * vector_size_in_bytes / wordSize;
1294 }
1295 
1296 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
1297   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1298 
1299   // Scan bitset to accumulate register numbers
1300   unsigned char regs[32];
1301   int count = bitset_to_regs(bitset, regs);
1302 
1303   for (int i = count - 1; i >= 0; i--) {
1304     vl1r_v(as_VectorRegister(regs[i]), stack);
1305     add(stack, stack, vector_size_in_bytes);
1306   }
1307 
1308   return count * vector_size_in_bytes / wordSize;
1309 }
1310 #endif // COMPILER2
1311 
1312 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
1313   // Push integer registers x7, x10-x17, x28-x31.
1314   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1315 
1316   // Push float registers f0-f7, f10-f17, f28-f31.
1317   addi(sp, sp, - wordSize * 20);
1318   int offset = 0;
1319   for (int i = 0; i < 32; i++) {
1320     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1321       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1322     }
1323   }
1324 }
1325 
1326 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
1327   int offset = 0;
1328   for (int i = 0; i < 32; i++) {
1329     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1330       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1331     }
1332   }
1333   addi(sp, sp, wordSize * 20);
1334 
1335   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1336 }
1337 
1338 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
1339   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1340   push_reg(RegSet::range(x5, x31), sp);
1341 
1342   // float registers
1343   addi(sp, sp, - 32 * wordSize);
1344   for (int i = 0; i < 32; i++) {
1345     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
1346   }
1347 
1348   // vector registers
1349   if (save_vectors) {
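         // vsetvli with e64/m8 selects LMUL = 8, so each vse64_v below spills a
         // group of eight vector registers at once.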
1350     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
1351     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1352     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1353       add(t0, sp, vector_size_in_bytes * i);
1354       vse64_v(as_VectorRegister(i), t0);
1355     }
1356   }
1357 }
1358 
1359 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
1360   // vector registers
1361   if (restore_vectors) {
1362     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1363     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1364       vle64_v(as_VectorRegister(i), sp);
1365       add(sp, sp, vector_size_in_bytes * 8);
1366     }
1367   }
1368 
1369   // float registers
1370   for (int i = 0; i < 32; i++) {
1371     fld(as_FloatRegister(i), Address(sp, i * wordSize));
1372   }
1373   addi(sp, sp, 32 * wordSize);
1374 
1375   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1376   pop_reg(RegSet::range(x5, x31), sp);
1377 }
1378 
1379 static int patch_offset_in_jal(address branch, int64_t offset) {
1380   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
1381          "offset is too large to be patched in one jal instruction!\n");
1382   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
1383   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
1384   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
1385   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
1386   return NativeInstruction::instruction_size;                                   // only one instruction
1387 }
1388 
1389 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
1390   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
1391          "offset is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n");
1392   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
1393   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
1394   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
1395   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
1396   return NativeInstruction::instruction_size;                                   // only one instruction
1397 }
1398 
1399 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
1400   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
1401   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
1402   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
1403   return PC_RELATIVE_INSTRUCTION_NUM * NativeInstruction::instruction_size;
1404 }
1405 
1406 static int patch_addr_in_movptr(address branch, address target) {
1407   const int MOVPTR_INSTRUCTIONS_NUM = 6;                                        // lui + addi + slli + addi + slli + addi/jalr/load
1408   int32_t lower = ((intptr_t)target << 35) >> 35;
1409   int64_t upper = ((intptr_t)target - lower) >> 29;
1410   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
1411   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
1412   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
1413   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
1414   return MOVPTR_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1415 }
1416 
1417 static int patch_imm_in_li64(address branch, address target) {
1418   const int LI64_INSTRUCTIONS_NUM = 8;                                          // lui + addi + slli + addi + slli + addi + slli + addi
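       // li64 materializes the constant with lui + addi for the upper 32 bits and
       // three further addi (each preceded by a slli) carrying the low 32 bits in
       // 12/12/8-bit chunks; the upper parts below are pre-adjusted because addi
       // sign-extends its 12-bit immediate.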
1419   int64_t lower = (intptr_t)target & 0xffffffff;
1420   lower = lower - ((lower << 44) >> 44);
1421   int64_t tmp_imm = ((uint64_t)((intptr_t)target & 0xffffffff00000000)) + (uint64_t)lower;
1422   int32_t upper =  (tmp_imm - (int32_t)lower) >> 32;
1423   int64_t tmp_upper = upper, tmp_lower = upper;
1424   tmp_lower = (tmp_lower << 52) >> 52;
1425   tmp_upper -= tmp_lower;
1426   tmp_upper >>= 12;
1427   // Load upper 32 bits. Upper = target[63:32], but if target[31] = 1 or (target[31:20] == 0x7ff && target[19] == 1),
1428   // upper = target[63:32] + 1.
1429   Assembler::patch(branch + 0,  31, 12, tmp_upper & 0xfffff);                       // Lui.
1430   Assembler::patch(branch + 4,  31, 20, tmp_lower & 0xfff);                         // Addi.
1431   // Load the remaining 32 bits.
1432   Assembler::patch(branch + 12, 31, 20, ((int32_t)lower >> 20) & 0xfff);            // Addi.
1433   Assembler::patch(branch + 20, 31, 20, (((intptr_t)target << 44) >> 52) & 0xfff);  // Addi.
1434   Assembler::patch(branch + 28, 31, 20, (intptr_t)target & 0xff);                   // Addi.
1435   return LI64_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1436 }
1437 
1438 static int patch_imm_in_li16u(address branch, uint16_t target) {
1439   Assembler::patch(branch, 31, 12, target); // patch lui only
1440   return NativeInstruction::instruction_size;
1441 }
1442 
1443 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
1444   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
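       // 'lower' is the sign-extended low 12 bits of the target; 'upper' is adjusted
       // so that upper + lower == target (mod 2^32) with upper's low 12 bits clear.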
1445   int64_t upper = (intptr_t)target;
1446   int32_t lower = (((int32_t)target) << 20) >> 20;
1447   upper -= lower;
1448   upper = (int32_t)upper;
1449   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
1450   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
1451   return LI32_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1452 }
1453 
1454 static long get_offset_of_jal(address insn_addr) {
1455   assert_cond(insn_addr != nullptr);
1456   long offset = 0;
1457   unsigned insn = Assembler::ld_instr(insn_addr);
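       // Reassemble the J-type immediate: insn[31] -> offset[20], insn[19:12] -> offset[19:12],
       // insn[20] -> offset[11], insn[30:21] -> offset[10:1]; the final shift pair
       // sign-extends the 21-bit result.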
1458   long val = (long)Assembler::sextract(insn, 31, 12);
1459   offset |= ((val >> 19) & 0x1) << 20;
1460   offset |= (val & 0xff) << 12;
1461   offset |= ((val >> 8) & 0x1) << 11;
1462   offset |= ((val >> 9) & 0x3ff) << 1;
1463   offset = (offset << 43) >> 43;
1464   return offset;
1465 }
1466 
1467 static long get_offset_of_conditional_branch(address insn_addr) {
1468   long offset = 0;
1469   assert_cond(insn_addr != nullptr);
1470   unsigned insn = Assembler::ld_instr(insn_addr);
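       // Reassemble the B-type immediate: insn[31] -> offset[12], insn[7] -> offset[11],
       // insn[30:25] -> offset[10:5], insn[11:8] -> offset[4:1], then sign-extend.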
1471   offset = (long)Assembler::sextract(insn, 31, 31);
1472   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
1473   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
1474   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
1475   offset = (offset << 41) >> 41;
1476   return offset;
1477 }
1478 
1479 static long get_offset_of_pc_relative(address insn_addr) {
1480   long offset = 0;
1481   assert_cond(insn_addr != nullptr);
1482   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
1483   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
1484   offset = (offset << 32) >> 32;
1485   return offset;
1486 }
1487 
1488 static address get_target_of_movptr(address insn_addr) {
1489   assert_cond(insn_addr != nullptr);
1490   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
1491   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
1492   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
1493   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
1494   return (address) target_address;
1495 }
1496 
1497 static address get_target_of_li64(address insn_addr) {
1498   assert_cond(insn_addr != nullptr);
1499   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 44; // Lui.
1500   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 32;                 // Addi.
1501   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 20;                // Addi.
1502   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20)) << 8;                 // Addi.
1503   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 28), 31, 20));                      // Addi.
1504   return (address)target_address;
1505 }
1506 
1507 address MacroAssembler::get_target_of_li32(address insn_addr) {
1508   assert_cond(insn_addr != nullptr);
1509   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
1510   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
1511   return (address)target_address;
1512 }
1513 
1514 // Patch any kind of instruction; there may be several instructions.
1515 // Return the total length (in bytes) of the instructions.
1516 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
1517   assert_cond(branch != nullptr);
1518   int64_t offset = target - branch;
1519   if (NativeInstruction::is_jal_at(branch)) {                         // jal
1520     return patch_offset_in_jal(branch, offset);
1521   } else if (NativeInstruction::is_branch_at(branch)) {               // beq/bge/bgeu/blt/bltu/bne
1522     return patch_offset_in_conditional_branch(branch, offset);
1523   } else if (NativeInstruction::is_pc_relative_at(branch)) {          // auipc, addi/jalr/load
1524     return patch_offset_in_pc_relative(branch, offset);
1525   } else if (NativeInstruction::is_movptr_at(branch)) {               // movptr
1526     return patch_addr_in_movptr(branch, target);
1527   } else if (NativeInstruction::is_li64_at(branch)) {                 // li64
1528     return patch_imm_in_li64(branch, target);
1529   } else if (NativeInstruction::is_li32_at(branch)) {                 // li32
1530     int64_t imm = (intptr_t)target;
1531     return patch_imm_in_li32(branch, (int32_t)imm);
1532   } else if (NativeInstruction::is_li16u_at(branch)) {
1533     int64_t imm = (intptr_t)target;
1534     return patch_imm_in_li16u(branch, (uint16_t)imm);
1535   } else {
1536 #ifdef ASSERT
1537     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
1538                   Assembler::ld_instr(branch), p2i(branch));
1539     Disassembler::decode(branch - 16, branch + 16);
1540 #endif
1541     ShouldNotReachHere();
1542     return -1;
1543   }
1544 }
1545 
1546 address MacroAssembler::target_addr_for_insn(address insn_addr) {
1547   long offset = 0;
1548   assert_cond(insn_addr != nullptr);
1549   if (NativeInstruction::is_jal_at(insn_addr)) {                     // jal
1550     offset = get_offset_of_jal(insn_addr);
1551   } else if (NativeInstruction::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
1552     offset = get_offset_of_conditional_branch(insn_addr);
1553   } else if (NativeInstruction::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
1554     offset = get_offset_of_pc_relative(insn_addr);
1555   } else if (NativeInstruction::is_movptr_at(insn_addr)) {           // movptr
1556     return get_target_of_movptr(insn_addr);
1557   } else if (NativeInstruction::is_li64_at(insn_addr)) {             // li64
1558     return get_target_of_li64(insn_addr);
1559   } else if (NativeInstruction::is_li32_at(insn_addr)) {             // li32
1560     return get_target_of_li32(insn_addr);
1561   } else {
1562     ShouldNotReachHere();
1563   }
1564   return address(((uintptr_t)insn_addr + offset));
1565 }
1566 
1567 int MacroAssembler::patch_oop(address insn_addr, address o) {
1568   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
1569   // narrow OOPs by setting the upper 16 bits in the first
1570   // instruction.
1571   if (NativeInstruction::is_li32_at(insn_addr)) {
1572     // Move narrow OOP
1573     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
1574     return patch_imm_in_li32(insn_addr, (int32_t)n);
1575   } else if (NativeInstruction::is_movptr_at(insn_addr)) {
1576     // Move wide OOP
1577     return patch_addr_in_movptr(insn_addr, o);
1578   }
1579   ShouldNotReachHere();
1580   return -1;
1581 }
1582 
1583 void MacroAssembler::reinit_heapbase() {
1584   if (UseCompressedOops) {
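         // Until the heap is fully initialized the narrow-oop base may still change,
         // so load it indirectly through its address; afterwards the value is constant
         // and can be embedded directly.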
1585     if (Universe::is_fully_initialized()) {
1586       mv(xheapbase, CompressedOops::ptrs_base());
1587     } else {
1588       ExternalAddress target(CompressedOops::ptrs_base_addr());
1589       relocate(target.rspec(), [&] {
1590         int32_t offset;
1591         la(xheapbase, target.target(), offset);
1592         ld(xheapbase, Address(xheapbase, offset));
1593       });
1594     }
1595   }
1596 }
1597 
1598 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset) {
1599   int64_t imm64 = (int64_t)addr;
1600 #ifndef PRODUCT
1601   {
1602     char buffer[64];
1603     snprintf(buffer, sizeof(buffer), "0x%" PRIx64, imm64);
1604     block_comment(buffer);
1605   }
1606 #endif
1607   assert((uintptr_t)imm64 < (1ull << 48), "48-bit overflow in address constant");
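       // The sequence is lui + addi (bits [47:17]), slli(11) + addi (bits [16:6]),
       // slli(6); the low 6 bits are returned in 'offset' for the following
       // addi/jalr/load to consume.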
1608   // Load upper 31 bits
1609   int64_t imm = imm64 >> 17;
1610   int64_t upper = imm, lower = imm;
1611   lower = (lower << 52) >> 52;
1612   upper -= lower;
1613   upper = (int32_t)upper;
1614   lui(Rd, upper);
1615   addi(Rd, Rd, lower);
1616 
1617   // Load the remaining 17 bits.
1618   slli(Rd, Rd, 11);
1619   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
1620   slli(Rd, Rd, 6);
1621 
1622   // This offset will be used by following jalr/ld.
1623   offset = imm64 & 0x3f;
1624 }
1625 
1626 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register temp) {
1627   if (is_simm12(increment)) {
1628     addi(Rd, Rn, increment);
1629   } else {
1630     assert_different_registers(Rn, temp);
1631     li(temp, increment);
1632     add(Rd, Rn, temp);
1633   }
1634 }
1635 
1636 void MacroAssembler::addw(Register Rd, Register Rn, int32_t increment, Register temp) {
1637   if (is_simm12(increment)) {
1638     addiw(Rd, Rn, increment);
1639   } else {
1640     assert_different_registers(Rn, temp);
1641     li(temp, increment);
1642     addw(Rd, Rn, temp);
1643   }
1644 }
1645 
1646 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register temp) {
1647   if (is_simm12(-decrement)) {
1648     addi(Rd, Rn, -decrement);
1649   } else {
1650     assert_different_registers(Rn, temp);
1651     li(temp, decrement);
1652     sub(Rd, Rn, temp);
1653   }
1654 }
1655 
1656 void MacroAssembler::subw(Register Rd, Register Rn, int32_t decrement, Register temp) {
1657   if (is_simm12(-decrement)) {
1658     addiw(Rd, Rn, -decrement);
1659   } else {
1660     assert_different_registers(Rn, temp);
1661     li(temp, decrement);
1662     subw(Rd, Rn, temp);
1663   }
1664 }
1665 
1666 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
1667   andr(Rd, Rs1, Rs2);
1668   sign_extend(Rd, Rd, 32);
1669 }
1670 
1671 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
1672   orr(Rd, Rs1, Rs2);
1673   sign_extend(Rd, Rd, 32);
1674 }
1675 
1676 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
1677   xorr(Rd, Rs1, Rs2);
1678   sign_extend(Rd, Rd, 32);
1679 }
1680 
1681 // Rd = Rs1 & (~Rs2)
1682 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
1683   if (UseZbb) {
1684     Assembler::andn(Rd, Rs1, Rs2);
1685     return;
1686   }
1687 
1688   notr(Rd, Rs2);
1689   andr(Rd, Rs1, Rd);
1690 }
1691 
1692 // Rd = Rs1 | (~Rs2)
1693 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
1694   if (UseZbb) {
1695     Assembler::orn(Rd, Rs1, Rs2);
1696     return;
1697   }
1698 
1699   notr(Rd, Rs2);
1700   orr(Rd, Rs1, Rd);
1701 }
1702 
1703 // Note: load_unsigned_short used to be called load_unsigned_word.
1704 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1705   int off = offset();
1706   lhu(dst, src);
1707   return off;
1708 }
1709 
1710 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1711   int off = offset();
1712   lbu(dst, src);
1713   return off;
1714 }
1715 
1716 int MacroAssembler::load_signed_short(Register dst, Address src) {
1717   int off = offset();
1718   lh(dst, src);
1719   return off;
1720 }
1721 
1722 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1723   int off = offset();
1724   lb(dst, src);
1725   return off;
1726 }
1727 
1728 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
1729   switch (size_in_bytes) {
1730     case  8:  ld(dst, src); break;
1731     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
1732     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1733     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1734     default:  ShouldNotReachHere();
1735   }
1736 }
1737 
1738 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
1739   switch (size_in_bytes) {
1740     case  8:  sd(src, dst); break;
1741     case  4:  sw(src, dst); break;
1742     case  2:  sh(src, dst); break;
1743     case  1:  sb(src, dst); break;
1744     default:  ShouldNotReachHere();
1745   }
1746 }
1747 
1748 // granularity is 1 or 2 bytes per load; dst and src.base() are allowed to be the same register
1749 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
1750   if (granularity != 1 && granularity != 2) {
1751     ShouldNotReachHere();
1752   }
1753   if (AvoidUnalignedAccesses && (granularity != 2)) {
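         // Assemble the halfword from two byte loads: the high byte (at offset + 1,
         // sign- or zero-extended as requested) is shifted into place and added to the
         // zero-extended low byte; the lanes don't overlap, so add acts as an or.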
1754     assert_different_registers(dst, tmp);
1755     assert_different_registers(tmp, src.base());
1756     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
1757     slli(tmp, tmp, 8);
1758     lbu(dst, src);
1759     add(dst, dst, tmp);
1760   } else {
1761     is_signed ? lh(dst, src) : lhu(dst, src);
1762   }
1763 }
1764 
1765 // granularity is 1, 2 or 4 bytes per load; if granularity is 2 or 4, dst and src.base() are allowed to be the same register
1766 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
1767   if (AvoidUnalignedAccesses && (granularity != 4)) {
1768     switch(granularity) {
1769       case 1:
1770         assert_different_registers(dst, tmp, src.base());
1771         lbu(dst, src);
1772         lbu(tmp, Address(src.base(), src.offset() + 1));
1773         slli(tmp, tmp, 8);
1774         add(dst, dst, tmp);
1775         lbu(tmp, Address(src.base(), src.offset() + 2));
1776         slli(tmp, tmp, 16);
1777         add(dst, dst, tmp);
1778         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
1779         slli(tmp, tmp, 24);
1780         add(dst, dst, tmp);
1781         break;
1782       case 2:
1783         assert_different_registers(dst, tmp);
1784         assert_different_registers(tmp, src.base());
1785         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
1786         slli(tmp, tmp, 16);
1787         lhu(dst, src);
1788         add(dst, dst, tmp);
1789         break;
1790       default:
1791         ShouldNotReachHere();
1792     }
1793   } else {
1794     is_signed ? lw(dst, src) : lwu(dst, src);
1795   }
1796 }
1797 
1798 // granularity is 1, 2, 4 or 8 bytes per load; if granularity is 4 or 8, dst and src.base() are allowed to be the same register
1799 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
1800   if (AvoidUnalignedAccesses && (granularity != 8)) {
1801     switch(granularity){
1802       case 1:
1803         assert_different_registers(dst, tmp, src.base());
1804         lbu(dst, src);
1805         lbu(tmp, Address(src.base(), src.offset() + 1));
1806         slli(tmp, tmp, 8);
1807         add(dst, dst, tmp);
1808         lbu(tmp, Address(src.base(), src.offset() + 2));
1809         slli(tmp, tmp, 16);
1810         add(dst, dst, tmp);
1811         lbu(tmp, Address(src.base(), src.offset() + 3));
1812         slli(tmp, tmp, 24);
1813         add(dst, dst, tmp);
1814         lbu(tmp, Address(src.base(), src.offset() + 4));
1815         slli(tmp, tmp, 32);
1816         add(dst, dst, tmp);
1817         lbu(tmp, Address(src.base(), src.offset() + 5));
1818         slli(tmp, tmp, 40);
1819         add(dst, dst, tmp);
1820         lbu(tmp, Address(src.base(), src.offset() + 6));
1821         slli(tmp, tmp, 48);
1822         add(dst, dst, tmp);
1823         lbu(tmp, Address(src.base(), src.offset() + 7));
1824         slli(tmp, tmp, 56);
1825         add(dst, dst, tmp);
1826         break;
1827       case 2:
1828         assert_different_registers(dst, tmp, src.base());
1829         lhu(dst, src);
1830         lhu(tmp, Address(src.base(), src.offset() + 2));
1831         slli(tmp, tmp, 16);
1832         add(dst, dst, tmp);
1833         lhu(tmp, Address(src.base(), src.offset() + 4));
1834         slli(tmp, tmp, 32);
1835         add(dst, dst, tmp);
1836         lhu(tmp, Address(src.base(), src.offset() + 6));
1837         slli(tmp, tmp, 48);
1838         add(dst, dst, tmp);
1839         break;
1840       case 4:
1841         assert_different_registers(dst, tmp);
1842         assert_different_registers(tmp, src.base());
1843         lwu(tmp, Address(src.base(), src.offset() + 4));
1844         slli(tmp, tmp, 32);
1845         lwu(dst, src);
1846         add(dst, dst, tmp);
1847         break;
1848       default:
1849         ShouldNotReachHere();
1850     }
1851   } else {
1852     ld(dst, src);
1853   }
1854 }
1855 
1856 
1857 // reverse bytes in halfword in lower 16 bits and sign-extend
1858 // Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
1859 void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
1860   if (UseZbb) {
1861     rev8(Rd, Rs);
1862     srai(Rd, Rd, 48);
1863     return;
1864   }
1865   assert_different_registers(Rs, tmp);
1866   assert_different_registers(Rd, tmp);
1867   srli(tmp, Rs, 8);
1868   andi(tmp, tmp, 0xFF);
1869   slli(Rd, Rs, 56);
1870   srai(Rd, Rd, 48); // sign-extend
1871   orr(Rd, Rd, tmp);
1872 }
1873 
1874 // reverse bytes in lower word and sign-extend
1875 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
1876 void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1877   if (UseZbb) {
1878     rev8(Rd, Rs);
1879     srai(Rd, Rd, 32);
1880     return;
1881   }
1882   assert_different_registers(Rs, tmp1, tmp2);
1883   assert_different_registers(Rd, tmp1, tmp2);
1884   revb_h_w_u(Rd, Rs, tmp1, tmp2);
1885   slli(tmp2, Rd, 48);
1886   srai(tmp2, tmp2, 32); // sign-extend
1887   srli(Rd, Rd, 16);
1888   orr(Rd, Rd, tmp2);
1889 }
1890 
1891 // reverse bytes in halfword in lower 16 bits and zero-extend
1892 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1893 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
1894   if (UseZbb) {
1895     rev8(Rd, Rs);
1896     srli(Rd, Rd, 48);
1897     return;
1898   }
1899   assert_different_registers(Rs, tmp);
1900   assert_different_registers(Rd, tmp);
1901   srli(tmp, Rs, 8);
1902   andi(tmp, tmp, 0xFF);
1903   andi(Rd, Rs, 0xFF);
1904   slli(Rd, Rd, 8);
1905   orr(Rd, Rd, tmp);
1906 }
1907 
1908 // reverse bytes in halfwords in lower 32 bits and zero-extend
1909 // Rd[31:0] = Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1910 void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1911   if (UseZbb) {
1912     rev8(Rd, Rs);
1913     rori(Rd, Rd, 32);
1914     roriw(Rd, Rd, 16);
1915     zero_extend(Rd, Rd, 32);
1916     return;
1917   }
1918   assert_different_registers(Rs, tmp1, tmp2);
1919   assert_different_registers(Rd, tmp1, tmp2);
1920   srli(tmp2, Rs, 16);
1921   revb_h_h_u(tmp2, tmp2, tmp1);
1922   revb_h_h_u(Rd, Rs, tmp1);
1923   slli(tmp2, tmp2, 16);
1924   orr(Rd, Rd, tmp2);
1925 }
1926 
1927 // This method is only used for revb_h
1928 // Rd = Rs[47:0] Rs[55:48] Rs[63:56]
1929 void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1930   assert_different_registers(Rs, tmp1, tmp2);
1931   assert_different_registers(Rd, tmp1);
1932   srli(tmp1, Rs, 48);
1933   andi(tmp2, tmp1, 0xFF);
1934   slli(tmp2, tmp2, 8);
1935   srli(tmp1, tmp1, 8);
1936   orr(tmp1, tmp1, tmp2);
1937   slli(Rd, Rs, 16);
1938   orr(Rd, Rd, tmp1);
1939 }
1940 
1941 // reverse bytes in each halfword
1942 // Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
1943 void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1944   if (UseZbb) {
1945     assert_different_registers(Rs, tmp1);
1946     assert_different_registers(Rd, tmp1);
1947     rev8(Rd, Rs);
1948     zero_extend(tmp1, Rd, 32);
1949     roriw(tmp1, tmp1, 16);
1950     slli(tmp1, tmp1, 32);
1951     srli(Rd, Rd, 32);
1952     roriw(Rd, Rd, 16);
1953     zero_extend(Rd, Rd, 32);
1954     orr(Rd, Rd, tmp1);
1955     return;
1956   }
1957   assert_different_registers(Rs, tmp1, tmp2);
1958   assert_different_registers(Rd, tmp1, tmp2);
1959   revb_h_helper(Rd, Rs, tmp1, tmp2);
1960   for (int i = 0; i < 3; ++i) {
1961     revb_h_helper(Rd, Rd, tmp1, tmp2);
1962   }
1963 }
1964 
1965 // reverse bytes in each word
1966 // Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
1967 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1968   if (UseZbb) {
1969     rev8(Rd, Rs);
1970     rori(Rd, Rd, 32);
1971     return;
1972   }
1973   assert_different_registers(Rs, tmp1, tmp2);
1974   assert_different_registers(Rd, tmp1, tmp2);
1975   revb(Rd, Rs, tmp1, tmp2);
1976   ror_imm(Rd, Rd, 32);
1977 }
1978 
1979 // reverse bytes in doubleword
1980 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56]
1981 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1982   if (UseZbb) {
1983     rev8(Rd, Rs);
1984     return;
1985   }
1986   assert_different_registers(Rs, tmp1, tmp2);
1987   assert_different_registers(Rd, tmp1, tmp2);
1988   andi(tmp1, Rs, 0xFF);
1989   slli(tmp1, tmp1, 8);
1990   for (int step = 8; step < 56; step += 8) {
1991     srli(tmp2, Rs, step);
1992     andi(tmp2, tmp2, 0xFF);
1993     orr(tmp1, tmp1, tmp2);
1994     slli(tmp1, tmp1, 8);
1995   }
1996   srli(Rd, Rs, 56);
1997   andi(Rd, Rd, 0xFF);
1998   orr(Rd, tmp1, Rd);
1999 }
2000 
2001 // rotate right by shift bits
2002 void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Register tmp)
2003 {
2004   if (UseZbb) {
2005     rori(dst, src, shift);
2006     return;
2007   }
2008 
2009   assert_different_registers(dst, tmp);
2010   assert_different_registers(src, tmp);
2011   assert(shift < 64, "shift amount must be < 64");
2012   slli(tmp, src, 64 - shift);
2013   srli(dst, src, shift);
2014   orr(dst, dst, tmp);
2015 }
2016 
2017 // rotate left by shift bits, 32-bit version
2018 void MacroAssembler::rolw_imm(Register dst, Register src, uint32_t shift, Register tmp) {
2019   if (UseZbb) {
2020     // no roliw available
2021     roriw(dst, src, 32 - shift);
2022     return;
2023   }
2024 
2025   assert_different_registers(dst, tmp);
2026   assert_different_registers(src, tmp);
2027   assert(shift < 32, "shift amount must be < 32");
2028   srliw(tmp, src, 32 - shift);
2029   slliw(dst, src, shift);
2030   orr(dst, dst, tmp);
2031 }
2032 
2033 void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) {
2034   if (is_simm12(imm)) {
2035     and_imm12(Rd, Rn, imm);
2036   } else {
2037     assert_different_registers(Rn, tmp);
2038     mv(tmp, imm);
2039     andr(Rd, Rn, tmp);
2040   }
2041 }
2042 
2043 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
2044   ld(tmp1, adr);
2045   if (src.is_register()) {
2046     orr(tmp1, tmp1, src.as_register());
2047   } else {
2048     if (is_simm12(src.as_constant())) {
2049       ori(tmp1, tmp1, src.as_constant());
2050     } else {
2051       assert_different_registers(tmp1, tmp2);
2052       mv(tmp2, src.as_constant());
2053       orr(tmp1, tmp1, tmp2);
2054     }
2055   }
2056   sd(tmp1, adr);
2057 }
2058 
2059 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L) {
2060   assert_different_registers(oop, trial_klass, tmp1, tmp2);
2061   if (UseCompressedClassPointers) {
2062     lwu(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2063     if (CompressedKlassPointers::base() == nullptr) {
2064       slli(tmp1, tmp1, CompressedKlassPointers::shift());
2065       beq(trial_klass, tmp1, L);
2066       return;
2067     }
2068     decode_klass_not_null(tmp1, tmp2);
2069   } else {
2070     ld(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2071   }
2072   beq(trial_klass, tmp1, L);
2073 }
2074 
2075 // Move an oop into a register.
2076 void MacroAssembler::movoop(Register dst, jobject obj) {
2077   int oop_index;
2078   if (obj == nullptr) {
2079     oop_index = oop_recorder()->allocate_oop_index(obj);
2080   } else {
2081 #ifdef ASSERT
2082     {
2083       ThreadInVMfromUnknown tiv;
2084       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
2085     }
2086 #endif
2087     oop_index = oop_recorder()->find_index(obj);
2088   }
2089   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2090 
2091   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
2092     mv(dst, Address((address)obj, rspec));
2093   } else {
2094     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
2095     ld_constant(dst, Address(dummy, rspec));
2096   }
2097 }
2098 
2099 // Move a metadata address into a register.
2100 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2101   int oop_index;
2102   if (obj == nullptr) {
2103     oop_index = oop_recorder()->allocate_metadata_index(obj);
2104   } else {
2105     oop_index = oop_recorder()->find_index(obj);
2106   }
2107   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2108   mv(dst, Address((address)obj, rspec));
2109 }
2110 
2111 // Writes to successive stack pages, until the given size is reached, to check
2112 // for stack overflow + shadow pages.  This clobbers tmp.
2113 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2114   assert_different_registers(tmp, size, t0);
2115   // Bang stack for total size given plus shadow page size.
2116   // Bang one page at a time because large size can bang beyond yellow and
2117   // red zones.
2118   mv(t0, (int)os::vm_page_size());
2119   Label loop;
2120   bind(loop);
2121   sub(tmp, sp, t0);
2122   subw(size, size, t0);
2123   sd(size, Address(tmp));
2124   bgtz(size, loop);
2125 
2126   // Bang down shadow pages too.
2127   // At this point, (tmp-0) is the last address touched, so don't
2128   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2129   // was post-decremented.)  Skip this address by starting at i=1, and
2130   // touch a few more pages below.  N.B.  It is important to touch all
2131   // the way down to and including i=StackShadowPages.
2132   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
2133     // this could be any sized move but it can serve as a debugging crumb
2134     // so the bigger the better.
2135     sub(tmp, tmp, (int)os::vm_page_size());
2136     sd(size, Address(tmp, 0));
2137   }
2138 }
2139 
2140 SkipIfEqual::SkipIfEqual(MacroAssembler* masm, const bool* flag_addr, bool value) {
2142   _masm = masm;
2143   ExternalAddress target((address)flag_addr);
2144   _masm->relocate(target.rspec(), [&] {
2145     int32_t offset;
2146     _masm->la(t0, target.target(), offset);
2147     _masm->lbu(t0, Address(t0, offset));
2148   });
2149   if (value) {
2150     _masm->bnez(t0, _label);
2151   } else {
2152     _masm->beqz(t0, _label);
2153   }
2154 }
2155 
2156 SkipIfEqual::~SkipIfEqual() {
2157   _masm->bind(_label);
2158   _masm = nullptr;
2159 }
2160 
2161 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
2162   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
2163   ld(dst, Address(xmethod, Method::const_offset()));
2164   ld(dst, Address(dst, ConstMethod::constants_offset()));
2165   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
2166   ld(dst, Address(dst, mirror_offset));
2167   resolve_oop_handle(dst, tmp1, tmp2);
2168 }
2169 
2170 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
2171   // OopHandle::resolve is an indirection.
2172   assert_different_registers(result, tmp1, tmp2);
2173   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
2174 }
2175 
2176 // ((WeakHandle)result).resolve()
2177 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
2178   assert_different_registers(result, tmp1, tmp2);
2179   Label resolved;
2180 
2181   // A null weak handle resolves to null.
2182   beqz(result, resolved);
2183 
2184   // Only 64 bit platforms support GCs that require a tmp register
2185   // Only IN_HEAP loads require a thread_tmp register
2186   // WeakHandle::resolve is an indirection like jweak.
2187   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2188                  result, Address(result), tmp1, tmp2);
2189   bind(resolved);
2190 }
2191 
2192 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
2193                                     Register dst, Address src,
2194                                     Register tmp1, Register tmp2) {
2195   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2196   decorators = AccessInternal::decorator_fixup(decorators, type);
2197   bool as_raw = (decorators & AS_RAW) != 0;
2198   if (as_raw) {
2199     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
2200   } else {
2201     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
2202   }
2203 }
2204 
2205 void MacroAssembler::null_check(Register reg, int offset) {
2206   if (needs_explicit_null_check(offset)) {
2207     // provoke OS null exception if reg is null by
2208     // accessing M[reg] w/o changing any registers
2209     // NOTE: this is plenty to provoke a segv
2210     ld(zr, Address(reg, 0));
2211   } else {
2212     // nothing to do, (later) access of M[reg + offset]
2213     // will provoke OS null exception if reg is null
2214   }
2215 }
2216 
2217 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
2218                                      Address dst, Register val,
2219                                      Register tmp1, Register tmp2, Register tmp3) {
2220   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2221   decorators = AccessInternal::decorator_fixup(decorators, type);
2222   bool as_raw = (decorators & AS_RAW) != 0;
2223   if (as_raw) {
2224     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2225   } else {
2226     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2227   }
2228 }
2229 
2230 // Algorithm must match CompressedOops::encode.
2231 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2232   verify_oop_msg(s, "broken oop in encode_heap_oop");
2233   if (CompressedOops::base() == nullptr) {
2234     if (CompressedOops::shift() != 0) {
2235       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2236       srli(d, s, LogMinObjAlignmentInBytes);
2237     } else {
2238       mv(d, s);
2239     }
2240   } else {
2241     Label notNull;
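         // Only null lies below the heap base, so a negative difference means the
         // oop was null; clamp it to zero so that null encodes as 0.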
2242     sub(d, s, xheapbase);
2243     bgez(d, notNull);
2244     mv(d, zr);
2245     bind(notNull);
2246     if (CompressedOops::shift() != 0) {
2247       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2248       srli(d, d, CompressedOops::shift());
2249     }
2250   }
2251 }
2252 
2253 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
2254   assert_different_registers(dst, tmp);
2255   assert_different_registers(src, tmp);
2256   if (UseCompressedClassPointers) {
2257     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2258     decode_klass_not_null(dst, tmp);
2259   } else {
2260     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2261   }
2262 }
2263 
2264 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
2265   // FIXME: Should this be a store release? Concurrent GCs assume the
2266   // klass length is valid if the klass field is not null.
2267   if (UseCompressedClassPointers) {
2268     encode_klass_not_null(src, tmp);
2269     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2270   } else {
2271     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2272   }
2273 }
2274 
2275 void MacroAssembler::store_klass_gap(Register dst, Register src) {
2276   if (UseCompressedClassPointers) {
2277     // Store to klass gap in destination
2278     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2279   }
2280 }
2281 
2282 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
2283   assert_different_registers(r, tmp);
2284   decode_klass_not_null(r, r, tmp);
2285 }
2286 
2287 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
2288   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2289 
2290   if (CompressedKlassPointers::base() == nullptr) {
2291     if (CompressedKlassPointers::shift() != 0) {
2292       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2293       slli(dst, src, LogKlassAlignmentInBytes);
2294     } else {
2295       mv(dst, src);
2296     }
2297     return;
2298   }
2299 
2300   Register xbase = dst;
2301   if (dst == src) {
2302     xbase = tmp;
2303   }
2304 
2305   assert_different_registers(src, xbase);
2306   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2307 
2308   if (CompressedKlassPointers::shift() != 0) {
2309     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2310     assert_different_registers(t0, xbase);
2311     shadd(dst, src, xbase, t0, LogKlassAlignmentInBytes);
2312   } else {
2313     add(dst, xbase, src);
2314   }
2315 }
2316 
2317 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
2318   assert_different_registers(r, tmp);
2319   encode_klass_not_null(r, r, tmp);
2320 }
2321 
2322 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
2323   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2324 
2325   if (CompressedKlassPointers::base() == nullptr) {
2326     if (CompressedKlassPointers::shift() != 0) {
2327       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2328       srli(dst, src, LogKlassAlignmentInBytes);
2329     } else {
2330       mv(dst, src);
2331     }
2332     return;
2333   }
2334 
2335   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
2336       CompressedKlassPointers::shift() == 0) {
2337     zero_extend(dst, src, 32);
2338     return;
2339   }
2340 
2341   Register xbase = dst;
2342   if (dst == src) {
2343     xbase = tmp;
2344   }
2345 
2346   assert_different_registers(src, xbase);
2347   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2348   sub(dst, src, xbase);
2349   if (CompressedKlassPointers::shift() != 0) {
2350     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2351     srli(dst, dst, LogKlassAlignmentInBytes);
2352   }
2353 }
2354 
2355 void MacroAssembler::decode_heap_oop_not_null(Register r) {
2356   decode_heap_oop_not_null(r, r);
2357 }
2358 
2359 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
2360   assert(UseCompressedOops, "should only be used for compressed headers");
2361   assert(Universe::heap() != nullptr, "java heap should be initialized");
2362   // Cannot assert, unverified entry point counts instructions (see .ad file)
2363   // vtableStubs also counts instructions in pd_code_size_limit.
2364   // Also do not verify_oop as this is called by verify_oop.
2365   if (CompressedOops::shift() != 0) {
2366     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2367     slli(dst, src, LogMinObjAlignmentInBytes);
2368     if (CompressedOops::base() != nullptr) {
2369       add(dst, xheapbase, dst);
2370     }
2371   } else {
2372     assert(CompressedOops::base() == nullptr, "sanity");
2373     mv(dst, src);
2374   }
2375 }
2376 
2377 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
2378   if (CompressedOops::base() == nullptr) {
2379     if (CompressedOops::shift() != 0 || d != s) {
2380       slli(d, s, CompressedOops::shift());
2381     }
2382   } else {
2383     Label done;
2384     mv(d, s);
2385     beqz(s, done);
2386     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
2387     bind(done);
2388   }
2389   verify_oop_msg(d, "broken oop in decode_heap_oop");
2390 }
2391 
2392 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
2393                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
2394   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
2395 }
2396 
2397 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
2398                                    Register tmp2, DecoratorSet decorators) {
2399   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
2400 }
2401 
2402 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
2403                                             Register tmp2, DecoratorSet decorators) {
2404   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
2405 }
2406 
2407 // Used for storing nulls.
2408 void MacroAssembler::store_heap_oop_null(Address dst) {
2409   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
2410 }
2411 
2412 int MacroAssembler::corrected_idivl(Register result, Register rs1, Register rs2,
2413                                     bool want_remainder, bool is_signed)
2414 {
2415   // Full implementation of Java idiv and irem.  The function
2416   // returns the (pc) offset of the div instruction - may be needed
2417   // for implicit exceptions.
2418   //
2419   // input : rs1: dividend
2420   //         rs2: divisor
2421   //
2422   // result: either
2423   //         quotient  (= rs1 idiv rs2)
2424   //         remainder (= rs1 irem rs2)
2425 
2426 
2427   int idivl_offset = offset();
2428   if (!want_remainder) {
2429     if (is_signed) {
2430       divw(result, rs1, rs2);
2431     } else {
2432       divuw(result, rs1, rs2);
2433     }
2434   } else {
2435     // result = rs1 % rs2;
2436     if (is_signed) {
2437       remw(result, rs1, rs2);
2438     } else {
2439       remuw(result, rs1, rs2);
2440     }
2441   }
2442   return idivl_offset;
2443 }
2444 
2445 int MacroAssembler::corrected_idivq(Register result, Register rs1, Register rs2,
2446                                     bool want_remainder, bool is_signed)
2447 {
2448   // Full implementation of Java ldiv and lrem.  The function
2449   // returns the (pc) offset of the div instruction - may be needed
2450   // for implicit exceptions.
2451   //
2452   // input : rs1: dividend
2453   //         rs2: divisor
2454   //
2455   // result: either
2456   //         quotient  (= rs1 idiv rs2)
2457   //         remainder (= rs1 irem rs2)
2458 
2459   int idivq_offset = offset();
2460   if (!want_remainder) {
2461     if (is_signed) {
2462       div(result, rs1, rs2);
2463     } else {
2464       divu(result, rs1, rs2);
2465     }
2466   } else {
2467     // result = rs1 % rs2;
2468     if (is_signed) {
2469       rem(result, rs1, rs2);
2470     } else {
2471       remu(result, rs1, rs2);
2472     }
2473   }
2474   return idivq_offset;
2475 }
2476 
2477 // Look up the method for a megamorphic invokeinterface call.
2478 // The target method is determined by <intf_klass, itable_index>.
2479 // The receiver klass is in recv_klass.
2480 // On success, the result will be in method_result, and execution falls through.
2481 // On failure, execution transfers to the given label.
2482 void MacroAssembler::lookup_interface_method(Register recv_klass,
2483                                              Register intf_klass,
2484                                              RegisterOrConstant itable_index,
2485                                              Register method_result,
2486                                              Register scan_tmp,
2487                                              Label& L_no_such_interface,
2488                                              bool return_method) {
2489   assert_different_registers(recv_klass, intf_klass, scan_tmp);
2490   assert_different_registers(method_result, intf_klass, scan_tmp);
2491   assert(recv_klass != method_result || !return_method,
2492          "recv_klass can be destroyed when method isn't needed");
2493   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
2494          "caller must be same register for non-constant itable index as for method");
2495 
2496   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
2497   int vtable_base = in_bytes(Klass::vtable_start_offset());
2498   int itentry_off = in_bytes(itableMethodEntry::method_offset());
2499   int scan_step   = itableOffsetEntry::size() * wordSize;
2500   int vte_size    = vtableEntry::size_in_bytes();
2501   assert(vte_size == wordSize, "else adjust times_vte_scale");
2502 
2503   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
2504 
2505   // Could store the aligned, prescaled offset in the klass.
2506   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
2507   add(scan_tmp, scan_tmp, vtable_base);
2508 
2509   if (return_method) {
2510     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
2511     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
2512     if (itable_index.is_register()) {
2513       slli(t0, itable_index.as_register(), 3);
2514     } else {
2515       mv(t0, itable_index.as_constant() << 3);
2516     }
2517     add(recv_klass, recv_klass, t0);
2518     if (itentry_off) {
2519       add(recv_klass, recv_klass, itentry_off);
2520     }
2521   }
2522 
2523   Label search, found_method;
2524 
2525   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2526   beq(intf_klass, method_result, found_method);
2527   bind(search);
2528   // Check that the previous entry is non-null. A null entry means that
2529   // the receiver class doesn't implement the interface, and wasn't the
2530   // same as when the caller was compiled.
2531   beqz(method_result, L_no_such_interface, /* is_far */ true);
2532   addi(scan_tmp, scan_tmp, scan_step);
2533   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2534   bne(intf_klass, method_result, search);
2535 
2536   bind(found_method);
2537 
2538   // Got a hit.
2539   if (return_method) {
2540     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
2541     add(method_result, recv_klass, scan_tmp);
2542     ld(method_result, Address(method_result));
2543   }
2544 }
2545 
2546 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
2547 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
2548 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
2549 // The target method is determined by <holder_klass, itable_index>.
2550 // The receiver klass is in recv_klass.
2551 // On success, the result will be in method_result, and execution falls through.
2552 // On failure, execution transfers to the given label.
2553 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
2554                                                   Register holder_klass,
2555                                                   Register resolved_klass,
2556                                                   Register method_result,
2557                                                   Register temp_itbl_klass,
2558                                                   Register scan_temp,
2559                                                   int itable_index,
2560                                                   Label& L_no_such_interface) {
2561   // 'method_result' is only used as output register at the very end of this method.
2562   // Until then we can reuse it as 'holder_offset'.
2563   Register holder_offset = method_result;
2564   assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
2565 
2566   int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
2567   int scan_step = itableOffsetEntry::size() * wordSize;
2568   int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
2569   int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
2570   int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
2571   const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
2572 
2573   Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
2574 
2575   lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
2576   add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
2577   // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
2578   //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
2579   // scan_temp = &(itable[0]._interface)
2580   // temp_itbl_klass = itable[0]._interface;
2581   shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
2582   ld(temp_itbl_klass, Address(scan_temp));
2583   mv(holder_offset, zr);
2584 
2585   // Initial checks:
2586   //   - if (holder_klass != resolved_klass), go to "scan for resolved"
2587   //   - if (itable[0] == holder_klass), shortcut to "holder found"
2588   //   - if (itable[0] == 0), no such interface
2589   bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
2590   beq(holder_klass, temp_itbl_klass, L_holder_found);
2591   beqz(temp_itbl_klass, L_no_such_interface);
2592 
2593   // Loop: Look for holder_klass record in itable
2594   //   do {
2595   //     temp_itbl_klass = *(scan_temp += scan_step);
2596   //     if (temp_itbl_klass == holder_klass) {
2597   //       goto L_holder_found; // Found!
2598   //     }
2599   //   } while (temp_itbl_klass != 0);
2600   //   goto L_no_such_interface // Not found.
2601   Label L_search_holder;
2602   bind(L_search_holder);
2603     add(scan_temp, scan_temp, scan_step);
2604     ld(temp_itbl_klass, Address(scan_temp));
2605     beq(holder_klass, temp_itbl_klass, L_holder_found);
2606     bnez(temp_itbl_klass, L_search_holder);
2607 
2608   j(L_no_such_interface);
2609 
2610   // Loop: Look for resolved_class record in itable
2611   //   while (true) {
2612   //     temp_itbl_klass = *(scan_temp += scan_step);
2613   //     if (temp_itbl_klass == 0) {
2614   //       goto L_no_such_interface;
2615   //     }
2616   //     if (temp_itbl_klass == resolved_klass) {
2617   //        goto L_resolved_found;  // Found!
2618   //     }
2619   //     if (temp_itbl_klass == holder_klass) {
2620   //        holder_offset = scan_temp;
2621   //     }
2622   //   }
2623   //
2624   Label L_loop_search_resolved;
2625   bind(L_loop_search_resolved);
2626     add(scan_temp, scan_temp, scan_step);
2627     ld(temp_itbl_klass, Address(scan_temp));
2628   bind(L_loop_search_resolved_entry);
2629     beqz(temp_itbl_klass, L_no_such_interface);
2630     beq(resolved_klass, temp_itbl_klass, L_resolved_found);
2631     bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
2632     mv(holder_offset, scan_temp);
2633     j(L_loop_search_resolved);
2634 
2635   // See if we already have a holder klass. If not, go and scan for it.
2636   bind(L_resolved_found);
2637   beqz(holder_offset, L_search_holder);
2638   mv(scan_temp, holder_offset);
2639 
2640   // Finally, scan_temp contains holder_klass vtable offset
2641   bind(L_holder_found);
2642   lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
2643   add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
2644                               - vtable_start_offset_bytes - ioffset_bytes); // subtract offsets to restore the original value of recv_klass
2645   add(method_result, recv_klass, method_result);
2646   ld(method_result, Address(method_result));
2647 }
2648 
2649 // virtual method calling
2650 void MacroAssembler::lookup_virtual_method(Register recv_klass,
2651                                            RegisterOrConstant vtable_index,
2652                                            Register method_result) {
2653   const ByteSize base = Klass::vtable_start_offset();
2654   assert(vtableEntry::size() * wordSize == 8,
2655          "adjust the scaling in the code below");
2656   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
2657 
2658   if (vtable_index.is_register()) {
2659     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
2660     ld(method_result, Address(method_result, vtable_offset_in_bytes));
2661   } else {
2662     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
2663     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
2664   }
2665 }
2666 
2667 void MacroAssembler::membar(uint32_t order_constraint) {
2668   address prev = pc() - NativeMembar::instruction_size;
2669   address last = code()->last_insn();
2670 
2671   if (last != nullptr && nativeInstruction_at(last)->is_membar() && prev == last) {
2672     NativeMembar *bar = NativeMembar_at(prev);
2673     // We are merging two memory barrier instructions.  On RISCV we
2674     // can do this simply by ORing them together.
2675     bar->set_kind(bar->get_kind() | order_constraint);
2676     BLOCK_COMMENT("merged membar");
2677   } else {
2678     code()->set_last_insn(pc());
2679 
2680     uint32_t predecessor = 0;
2681     uint32_t successor = 0;
2682 
2683     membar_mask_to_pred_succ(order_constraint, predecessor, successor);
2684     fence(predecessor, successor);
2685   }
2686 }
2687 
2688 // Form an address from base + offset in Rd. Rd may or may not
2689 // actually be used: you must use the Address that is returned. It
2690 // is up to you to ensure that the shift provided matches the size
2691 // of your data.
2692 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
2693   if (is_simm12(byte_offset)) { // 12: imm in range 2^12
2694     return Address(base, byte_offset);
2695   }
2696 
2697   assert_different_registers(Rd, base, noreg);
2698 
2699   // Do it the hard way
2700   mv(Rd, byte_offset);
2701   add(Rd, base, Rd);
2702   return Address(Rd);
2703 }
2704 
2705 void MacroAssembler::check_klass_subtype(Register sub_klass,
2706                                          Register super_klass,
2707                                          Register tmp_reg,
2708                                          Label& L_success) {
2709   Label L_failure;
2710   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
2711   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
2712   bind(L_failure);
2713 }
2714 
2715 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
2716   ld(t0, Address(xthread, JavaThread::polling_word_offset()));
2717   if (acquire) {
2718     membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
2719   }
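       // For a poll at a return, the polling word doubles as the stack watermark:
       // take the slow path when the frame (sp in an nmethod, fp otherwise) is above
       // it.  Otherwise just test whether the poll bit is armed.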
2720   if (at_return) {
2721     bgtu(in_nmethod ? sp : fp, t0, slow_path, /* is_far */ true);
2722   } else {
2723     test_bit(t0, t0, exact_log2(SafepointMechanism::poll_bit()));
2724     bnez(t0, slow_path, true /* is_far */);
2725   }
2726 }
2727 
2728 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2729                                 Label &succeed, Label *fail) {
2730   assert_different_registers(addr, tmp, t0);
2731   assert_different_registers(newv, tmp, t0);
2732   assert_different_registers(oldv, tmp, t0);
2733 
2734   // oldv holds comparison value
2735   // newv holds value to write in exchange
2736   // addr identifies memory word to compare against/update
2737   if (UseZacas) {
2738     mv(tmp, oldv);
2739     atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
2740     beq(tmp, oldv, succeed);
2741   } else {
2742     Label retry_load, nope;
2743     bind(retry_load);
2744     // Load reserved from the memory location
2745     load_reserved(tmp, addr, int64, Assembler::aqrl);
2746     // Fail and exit if it is not what we expect
2747     bne(tmp, oldv, nope);
2748     // If the store conditional succeeds, tmp will be zero
2749     store_conditional(tmp, newv, addr, int64, Assembler::rl);
2750     beqz(tmp, succeed);
2751     // Retry only when the store conditional failed
2752     j(retry_load);
2753 
2754     bind(nope);
2755   }
2756 
2757   // neither amocas nor lr/sc has an implied barrier in the failing case
2758   membar(AnyAny);
2759 
2760   mv(oldv, tmp);
2761   if (fail != nullptr) {
2762     j(*fail);
2763   }
2764 }
2765 
2766 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2767                                         Label &succeed, Label *fail) {
2768   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2769   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2770 }
2771 
2772 void MacroAssembler::load_reserved(Register dst,
2773                                    Register addr,
2774                                    enum operand_size size,
2775                                    Assembler::Aqrl acquire) {
2776   switch (size) {
2777     case int64:
2778       lr_d(dst, addr, acquire);
2779       break;
2780     case int32:
2781       lr_w(dst, addr, acquire);
2782       break;
2783     case uint32:
2784       lr_w(dst, addr, acquire);
2785       zero_extend(dst, dst, 32);
2786       break;
2787     default:
2788       ShouldNotReachHere();
2789   }
2790 }
2791 
2792 void MacroAssembler::store_conditional(Register dst,
2793                                        Register new_val,
2794                                        Register addr,
2795                                        enum operand_size size,
2796                                        Assembler::Aqrl release) {
2797   switch (size) {
2798     case int64:
2799       sc_d(dst, new_val, addr, release);
2800       break;
2801     case int32:
2802     case uint32:
2803       sc_w(dst, new_val, addr, release);
2804       break;
2805     default:
2806       ShouldNotReachHere();
2807   }
2808 }
2809 
2810 
2811 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected,
2812                                                  Register new_val,
2813                                                  enum operand_size size,
2814                                                  Register tmp1, Register tmp2, Register tmp3) {
2815   assert(size == int8 || size == int16, "unsupported operand size");
2816 
2817   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3;
2818 
2819   andi(shift, addr, 3);
2820   slli(shift, shift, 3);
2821 
2822   andi(aligned_addr, addr, ~3);
2823 
2824   if (size == int8) {
2825     mv(mask, 0xff);
2826   } else {
2827     // size == int16 case
2828     mv(mask, -1);
2829     zero_extend(mask, mask, 16);
2830   }
2831   sll(mask, mask, shift);
2832 
2833   notr(not_mask, mask);
2834 
2835   sll(expected, expected, shift);
2836   andr(expected, expected, mask);
2837 
2838   sll(new_val, new_val, shift);
2839   andr(new_val, new_val, mask);
2840 }
2841 
2842 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
2843 // It is designed to implement compare-and-swap on byte/boolean/char/short values using
2844 // lr.w/sc.w or amocas.w, which must operate on a 4-byte aligned address.
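     //
     // Rough sketch of the approach (illustrative pseudo-code; the narrow field is assumed
     // to lie within its enclosing aligned 32-bit word):
     //   shift = (addr & 3) * 8;  mask = (size == int8 ? 0xff : 0xffff) << shift;
     //   do {
     //     word  = *(addr & ~3);
     //     if ((word & mask) != (expected << shift)) fail;
     //     nword = (word & ~mask) | (new_val << shift);
     //   } while (!CAS32(addr & ~3, word, nword));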
2845 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
2846                                           Register new_val,
2847                                           enum operand_size size,
2848                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
2849                                           Register result, bool result_as_bool,
2850                                           Register tmp1, Register tmp2, Register tmp3) {
2851   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2852   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2853   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2854 
2855   Label retry, fail, done;
2856 
2857   bind(retry);
2858 
2859   if (UseZacas) {
2860     lw(old, aligned_addr);
2861 
2862     // if old & mask != expected
2863     andr(tmp, old, mask);
2864     bne(tmp, expected, fail);
2865 
2866     andr(tmp, old, not_mask);
2867     orr(tmp, tmp, new_val);
2868 
2869     atomic_cas(old, tmp, aligned_addr, operand_size::int32, acquire, release);
2870     bne(tmp, old, retry);
2871   } else {
2872     lr_w(old, aligned_addr, acquire);
2873     andr(tmp, old, mask);
2874     bne(tmp, expected, fail);
2875 
2876     andr(tmp, old, not_mask);
2877     orr(tmp, tmp, new_val);
2878     sc_w(tmp, tmp, aligned_addr, release);
2879     bnez(tmp, retry);
2880   }
2881 
2882   if (result_as_bool) {
2883     mv(result, 1);
2884     j(done);
2885 
2886     bind(fail);
2887     mv(result, zr);
2888 
2889     bind(done);
2890   } else {
2891     andr(tmp, old, mask);
2892 
2893     bind(fail);
2894     srl(result, tmp, shift);
2895 
2896     if (size == int8) {
2897       sign_extend(result, result, 8);
2898     } else {
2899       // size == int16 case
2900       sign_extend(result, result, 16);
2901     }
2902   }
2903 }
2904 
2905 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, used to implement
2906 // weak CAS operations. The major difference is that it simply reports failure when the
2907 // store-conditional fails, instead of retrying.
2908 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
2909                                                Register new_val,
2910                                                enum operand_size size,
2911                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
2912                                                Register result,
2913                                                Register tmp1, Register tmp2, Register tmp3) {
2914   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2915   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2916   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2917 
2918   Label fail, done;
2919 
2920   if (UseZacas) {
2921     lw(old, aligned_addr);
2922 
2923     // if old & mask != expected
2924     andr(tmp, old, mask);
2925     bne(tmp, expected, fail);
2926 
2927     andr(tmp, old, not_mask);
2928     orr(tmp, tmp, new_val);
2929 
2930     atomic_cas(tmp, new_val, addr, operand_size::int32, acquire, release);
2931     bne(tmp, old, fail);
2932   } else {
2933     lr_w(old, aligned_addr, acquire);
2934     andr(tmp, old, mask);
2935     bne(tmp, expected, fail);
2936 
2937     andr(tmp, old, not_mask);
2938     orr(tmp, tmp, new_val);
2939     sc_w(tmp, tmp, aligned_addr, release);
2940     bnez(tmp, fail);
2941   }
2942 
2943   // Success
2944   mv(result, 1);
2945   j(done);
2946 
2947   // Fail
2948   bind(fail);
2949   mv(result, zr);
2950 
2951   bind(done);
2952 }
2953 
2954 void MacroAssembler::cmpxchg(Register addr, Register expected,
2955                              Register new_val,
2956                              enum operand_size size,
2957                              Assembler::Aqrl acquire, Assembler::Aqrl release,
2958                              Register result, bool result_as_bool) {
2959   assert(size != int8 && size != int16, "unsupported operand size");
2960   assert_different_registers(addr, t0);
2961   assert_different_registers(expected, t0);
2962   assert_different_registers(new_val, t0);
2963 
2964   if (UseZacas) {
2965     if (result_as_bool) {
2966       mv(t0, expected);
2967       atomic_cas(t0, new_val, addr, size, acquire, release);
2968       xorr(t0, t0, expected);
2969       seqz(result, t0);
2970     } else {
2971       mv(result, expected);
2972       atomic_cas(result, new_val, addr, size, acquire, release);
2973     }
2974     return;
2975   }
2976 
2977   Label retry_load, done, ne_done;
2978   bind(retry_load);
2979   load_reserved(t0, addr, size, acquire);
2980   bne(t0, expected, ne_done);
2981   store_conditional(t0, new_val, addr, size, release);
2982   bnez(t0, retry_load);
2983 
2984   // equal, succeed
2985   if (result_as_bool) {
2986     mv(result, 1);
2987   } else {
2988     mv(result, expected);
2989   }
2990   j(done);
2991 
2992   // not equal, failed
2993   bind(ne_done);
2994   if (result_as_bool) {
2995     mv(result, zr);
2996   } else {
2997     mv(result, t0);
2998   }
2999 
3000   bind(done);
3001 }
3002 
3003 void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
3004                                   Register new_val,
3005                                   enum operand_size size,
3006                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
3007                                   Register result) {
3008   if (UseZacas) {
3009     cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
3010     return;
3011   }
3012 
3013   assert_different_registers(addr, t0);
3014   assert_different_registers(expected, t0);
3015   assert_different_registers(new_val, t0);
3016 
3017   Label fail, done;
3018   load_reserved(t0, addr, size, acquire);
3019   bne(t0, expected, fail);
3020   store_conditional(t0, new_val, addr, size, release);
3021   bnez(t0, fail);
3022 
3023   // Success
3024   mv(result, 1);
3025   j(done);
3026 
3027   // Fail
3028   bind(fail);
3029   mv(result, zr);
3030 
3031   bind(done);
3032 }
3033 
3034 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
3035 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
3036   prev = prev->is_valid() ? prev : zr;                                                      \
3037   if (incr.is_register()) {                                                                 \
3038     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
3039   } else {                                                                                  \
3040     mv(t0, incr.as_constant());                                                             \
3041     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
3042   }                                                                                         \
3043   return;                                                                                   \
3044 }
3045 
3046 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
3047 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
3048 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
3049 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
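     // For reference, ATOMIC_OP(add, amoadd_d, relaxed, relaxed) above defines atomic_add(),
     // which roughly performs: prev = *addr; *addr += incr; returning the old value in 'prev'
     // ('prev' is redirected to zr and discarded when an invalid register is passed).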
3050 
3051 #undef ATOMIC_OP
3052 
3053 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
3054 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
3055   prev = prev->is_valid() ? prev : zr;                                               \
3056   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
3057   return;                                                                            \
3058 }
3059 
3060 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
3061 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
3062 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
3063 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
3064 
3065 #undef ATOMIC_XCHG
3066 
3067 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
3068 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
3069   atomic_##OP2(prev, newv, addr);                                                    \
3070   zero_extend(prev, prev, 32);                                                       \
3071   return;                                                                            \
3072 }
3073 
3074 ATOMIC_XCHGU(xchgwu, xchgw)
3075 ATOMIC_XCHGU(xchgalwu, xchgalw)
3076 
3077 #undef ATOMIC_XCHGU
3078 
3079 #define ATOMIC_CAS(OP, AOP, ACQUIRE, RELEASE)                                        \
3080 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
3081   assert(UseZacas, "invariant");                                                     \
3082   prev = prev->is_valid() ? prev : zr;                                               \
3083   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
3084   return;                                                                            \
3085 }
3086 
3087 ATOMIC_CAS(cas, amocas_d, Assembler::relaxed, Assembler::relaxed)
3088 ATOMIC_CAS(casw, amocas_w, Assembler::relaxed, Assembler::relaxed)
3089 ATOMIC_CAS(casl, amocas_d, Assembler::relaxed, Assembler::rl)
3090 ATOMIC_CAS(caslw, amocas_w, Assembler::relaxed, Assembler::rl)
3091 ATOMIC_CAS(casal, amocas_d, Assembler::aq, Assembler::rl)
3092 ATOMIC_CAS(casalw, amocas_w, Assembler::aq, Assembler::rl)
3093 
3094 #undef ATOMIC_CAS
3095 
3096 #define ATOMIC_CASU(OP1, OP2)                                                        \
3097 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
3098   atomic_##OP2(prev, newv, addr);                                                    \
3099   zero_extend(prev, prev, 32);                                                       \
3100   return;                                                                            \
3101 }
3102 
3103 ATOMIC_CASU(caswu, casw)
3104 ATOMIC_CASU(caslwu, caslw)
3105 ATOMIC_CASU(casalwu, casalw)
3106 
3107 #undef ATOMIC_CASU
3108 
3109 void MacroAssembler::atomic_cas(
3110     Register prev, Register newv, Register addr, enum operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
3111   switch (size) {
3112     case int64:
3113       switch ((Assembler::Aqrl)(acquire | release)) {
3114         case Assembler::relaxed:
3115           atomic_cas(prev, newv, addr);
3116           break;
3117         case Assembler::rl:
3118           atomic_casl(prev, newv, addr);
3119           break;
3120         case Assembler::aqrl:
3121           atomic_casal(prev, newv, addr);
3122           break;
3123         default:
3124           ShouldNotReachHere();
3125       }
3126       break;
3127     case int32:
3128       switch ((Assembler::Aqrl)(acquire | release)) {
3129         case Assembler::relaxed:
3130           atomic_casw(prev, newv, addr);
3131           break;
3132         case Assembler::rl:
3133           atomic_caslw(prev, newv, addr);
3134           break;
3135         case Assembler::aqrl:
3136           atomic_casalw(prev, newv, addr);
3137           break;
3138         default:
3139           ShouldNotReachHere();
3140       }
3141       break;
3142     case uint32:
3143       switch ((Assembler::Aqrl)(acquire | release)) {
3144         case Assembler::relaxed:
3145           atomic_caswu(prev, newv, addr);
3146           break;
3147         case Assembler::rl:
3148           atomic_caslwu(prev, newv, addr);
3149           break;
3150         case Assembler::aqrl:
3151           atomic_casalwu(prev, newv, addr);
3152           break;
3153         default:
3154           ShouldNotReachHere();
3155       }
3156       break;
3157     default:
3158       ShouldNotReachHere();
3159   }
3160 }
3161 
3162 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
3163   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
3164   assert(CodeCache::find_blob(entry.target()) != nullptr,
3165          "destination of far call not found in code cache");
3166   assert(entry.rspec().type() == relocInfo::external_word_type
3167         || entry.rspec().type() == relocInfo::runtime_call_type
3168         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
3169   // Fixed length: see MacroAssembler::far_branch_size()
3170   relocate(entry.rspec(), [&] {
3171     int32_t offset;
3172     la(tmp, entry.target(), offset);
3173     jalr(x0, tmp, offset);
3174   });
3175 }
3176 
3177 void MacroAssembler::far_call(const Address &entry, Register tmp) {
3178   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
3179   assert(CodeCache::find_blob(entry.target()) != nullptr,
3180          "destination of far call not found in code cache");
3181   assert(entry.rspec().type() == relocInfo::external_word_type
3182         || entry.rspec().type() == relocInfo::runtime_call_type
3183         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
3184   // Fixed length: see MacroAssembler::far_branch_size()
3185   // We can use auipc + jalr here because we know that the total size of
3186   // the code cache cannot exceed 2GB.
3187   relocate(entry.rspec(), [&] {
3188     int32_t offset;
3189     la(tmp, entry.target(), offset);
3190     jalr(x1, tmp, offset); // link
3191   });
3192 }
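
     // Both far_jump and far_call above emit, roughly, this fixed-length sequence
     // (illustrative; la() splits the pc-relative offset into hi/lo parts):
     //   auipc tmp, %pcrel_hi(target)
     //   jalr  x0/x1, %pcrel_lo(target)(tmp)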
3193 
3194 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3195                                                    Register super_klass,
3196                                                    Register tmp_reg,
3197                                                    Label* L_success,
3198                                                    Label* L_failure,
3199                                                    Label* L_slow_path,
3200                                                    Register super_check_offset) {
3201   assert_different_registers(sub_klass, super_klass, tmp_reg);
3202   bool must_load_sco = (super_check_offset == noreg);
3203   if (must_load_sco) {
3204     assert(tmp_reg != noreg, "supply either a temp or a register offset");
3205   } else {
3206     assert_different_registers(sub_klass, super_klass, super_check_offset);
3207   }
3208 
3209   Label L_fallthrough;
3210   int label_nulls = 0;
3211   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3212   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3213   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
3214   assert(label_nulls <= 1, "at most one null in batch");
3215 
3216   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3217   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3218   Address super_check_offset_addr(super_klass, sco_offset);
3219 
3220   // Hacked jmp, which may only be used just before L_fallthrough.
3221 #define final_jmp(label)                                                \
3222   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3223   else                            j(label)             /*omit semi*/
3224 
3225   // If the pointers are equal, we are done (e.g., String[] elements).
3226   // This self-check enables sharing of secondary supertype arrays among
3227   // non-primary types such as array-of-interface. Otherwise, each such
3228   // type would need its own customized SSA.
3229   // We move this check to the front of the fast path because many
3230   // type checks are in fact trivially successful in this manner,
3231   // so we get a nicely predicted branch right at the start of the check.
3232   beq(sub_klass, super_klass, *L_success);
3233 
3234   // Check the supertype display:
3235   if (must_load_sco) {
3236     lwu(tmp_reg, super_check_offset_addr);
3237     super_check_offset = tmp_reg;
3238   }
3239   add(t0, sub_klass, super_check_offset);
3240   Address super_check_addr(t0);
3241   ld(t0, super_check_addr); // load displayed supertype
3242 
3243   // This check has worked decisively for primary supers.
3244   // Secondary supers are sought in the super_cache ('super_cache_addr').
3245   // (Secondary supers are interfaces and very deeply nested subtypes.)
3246   // This works in the same check above because of a tricky aliasing
3247   // between the super_cache and the primary super display elements.
3248   // (The 'super_check_addr' can address either, as the case requires.)
3249   // Note that the cache is updated below if it does not help us find
3250   // what we need immediately.
3251   // So if it was a primary super, we can just fail immediately.
3252   // Otherwise, it's the slow path for us (no success at this point).
3253 
3254   beq(super_klass, t0, *L_success);
3255   mv(t1, sc_offset);
3256   if (L_failure == &L_fallthrough) {
3257     beq(super_check_offset, t1, *L_slow_path);
3258   } else {
3259     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
3260     final_jmp(*L_slow_path);
3261   }
3262 
3263   bind(L_fallthrough);
3264 
3265 #undef final_jmp
3266 }
3267 
3268 // Scans 'count' pointer-sized words at [addr] for an occurrence of 'value';
3269 // generic version.
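     //
     // Roughly equivalent C sketch (illustrative only):
     //   while (count != 0) {
     //     if (*addr == value) break;   // leave addr pointing at the match
     //     addr += wordSize;
     //     count--;
     //   }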
3270 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
3271                                 Register tmp) {
3272   Label Lloop, Lexit;
3273   beqz(count, Lexit);
3274   bind(Lloop);
3275   ld(tmp, addr);
3276   beq(value, tmp, Lexit);
3277   add(addr, addr, wordSize);
3278   sub(count, count, 1);
3279   bnez(count, Lloop);
3280   bind(Lexit);
3281 }
3282 
3283 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3284                                                    Register super_klass,
3285                                                    Register tmp1_reg,
3286                                                    Register tmp2_reg,
3287                                                    Label* L_success,
3288                                                    Label* L_failure) {
3289   assert_different_registers(sub_klass, super_klass, tmp1_reg);
3290   if (tmp2_reg != noreg) {
3291     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
3292   }
3293 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
3294 
3295   Label L_fallthrough;
3296   int label_nulls = 0;
3297   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3298   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3299 
3300   assert(label_nulls <= 1, "at most one null in the batch");
3301 
3302   // A couple of useful fields in sub_klass:
3303   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3304   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3305   Address secondary_supers_addr(sub_klass, ss_offset);
3306   Address super_cache_addr(     sub_klass, sc_offset);
3307 
3308   BLOCK_COMMENT("check_klass_subtype_slow_path");
3309 
3310   // Do a linear scan of the secondary super-klass chain.
3311   // This code is rarely used, so simplicity is a virtue here.
3312   // The repne_scan loop below uses fixed registers, which we must spill.
3313   // Don't worry too much about pre-existing connections with the input regs.
3314 
3315   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
3316   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
3317 
3318   RegSet pushed_registers;
3319   if (!IS_A_TEMP(x12)) {
3320     pushed_registers += x12;
3321   }
3322   if (!IS_A_TEMP(x15)) {
3323     pushed_registers += x15;
3324   }
3325 
3326   if (super_klass != x10) {
3327     if (!IS_A_TEMP(x10)) {
3328       pushed_registers += x10;
3329     }
3330   }
3331 
3332   push_reg(pushed_registers, sp);
3333 
3334   // Get super_klass value into x10 (even if it was in x15 or x12)
3335   mv(x10, super_klass);
3336 
3337 #ifndef PRODUCT
3338   mv(t1, (address)&SharedRuntime::_partial_subtype_ctr);
3339   Address pst_counter_addr(t1);
3340   ld(t0, pst_counter_addr);
3341   add(t0, t0, 1);
3342   sd(t0, pst_counter_addr);
3343 #endif // PRODUCT
3344 
3345   // We will consult the secondary-super array.
3346   ld(x15, secondary_supers_addr);
3347   // Load the array length.
3348   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
3349   // Skip to start of data.
3350   add(x15, x15, Array<Klass*>::base_offset_in_bytes());
3351 
3352   // Set t0 to an obvious invalid value, falling through by default
3353   mv(t0, -1);
3354   // Scan X12 words at [X15] for an occurrence of X10.
3355   repne_scan(x15, x10, x12, t0);
3356 
3357   // pop will restore x10, so we should use a temp register to keep its value
3358   mv(t1, x10);
3359 
3360   // Unspill the temp registers:
3361   pop_reg(pushed_registers, sp);
3362 
3363   bne(t1, t0, *L_failure);
3364 
3365   // Success. Cache the super we found and proceed in triumph.
3366   sd(super_klass, super_cache_addr);
3367 
3368   if (L_success != &L_fallthrough) {
3369     j(*L_success);
3370   }
3371 
3372 #undef IS_A_TEMP
3373 
3374   bind(L_fallthrough);
3375 }
3376 
3377 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
3378 void MacroAssembler::tlab_allocate(Register obj,
3379                                    Register var_size_in_bytes,
3380                                    int con_size_in_bytes,
3381                                    Register tmp1,
3382                                    Register tmp2,
3383                                    Label& slow_case,
3384                                    bool is_far) {
3385   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3386   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
3387 }
3388 
3389 // get_thread() can be called anywhere inside generated code so we
3390 // need to save whatever non-callee save context might get clobbered
3391 // by the call to Thread::current() or, indeed, the call setup code.
3392 void MacroAssembler::get_thread(Register thread) {
3393   // save all call-clobbered regs except thread
3394   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
3395                       RegSet::range(x28, x31) + ra - thread;
3396   push_reg(saved_regs, sp);
3397 
3398   mv(ra, CAST_FROM_FN_PTR(address, Thread::current));
3399   jalr(ra);
3400   if (thread != c_rarg0) {
3401     mv(thread, c_rarg0);
3402   }
3403 
3404   // restore pushed registers
3405   pop_reg(saved_regs, sp);
3406 }
3407 
3408 void MacroAssembler::load_byte_map_base(Register reg) {
3409   CardTable::CardValue* byte_map_base =
3410     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
3411   mv(reg, (uint64_t)byte_map_base);
3412 }
3413 
3414 void MacroAssembler::build_frame(int framesize) {
3415   assert(framesize >= 2, "framesize must include space for FP/RA");
3416   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3417   sub(sp, sp, framesize);
3418   sd(fp, Address(sp, framesize - 2 * wordSize));
3419   sd(ra, Address(sp, framesize - wordSize));
3420   if (PreserveFramePointer) { add(fp, sp, framesize); }
3421 }
3422 
3423 void MacroAssembler::remove_frame(int framesize) {
3424   assert(framesize >= 2, "framesize must include space for FP/RA");
3425   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3426   ld(fp, Address(sp, framesize - 2 * wordSize));
3427   ld(ra, Address(sp, framesize - wordSize));
3428   add(sp, sp, framesize);
3429 }
3430 
3431 void MacroAssembler::reserved_stack_check() {
3432     // testing if reserved zone needs to be enabled
3433     Label no_reserved_zone_enabling;
3434 
3435     ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
3436     bltu(sp, t0, no_reserved_zone_enabling);
3437 
3438     enter();   // RA and FP are live.
3439     mv(c_rarg0, xthread);
3440     rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
3441     leave();
3442 
3443     // We have already removed our own frame.
3444     // throw_delayed_StackOverflowError will think that it's been
3445     // called by our caller.
3446     RuntimeAddress target(StubRoutines::throw_delayed_StackOverflowError_entry());
3447     relocate(target.rspec(), [&] {
3448       int32_t offset;
3449       movptr(t0, target.target(), offset);
3450       jalr(x0, t0, offset);
3451     });
3452     should_not_reach_here();
3453 
3454     bind(no_reserved_zone_enabling);
3455 }
3456 
3457 // Move the address of the polling page into dest.
3458 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
3459   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
3460 }
3461 
3462 // Read the polling page.  The address of the polling page must
3463 // already be in r.
3464 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
3465   relocate(rtype, [&] {
3466     lwu(zr, Address(r, offset));
3467   });
3468 }
3469 
3470 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3471 #ifdef ASSERT
3472   {
3473     ThreadInVMfromUnknown tiv;
3474     assert (UseCompressedOops, "should only be used for compressed oops");
3475     assert (Universe::heap() != nullptr, "java heap should be initialized");
3476     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
3477     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3478   }
3479 #endif
3480   int oop_index = oop_recorder()->find_index(obj);
3481   relocate(oop_Relocation::spec(oop_index), [&] {
3482     li32(dst, 0xDEADBEEF);
3483   });
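       // The 0xDEADBEEF immediate above is only a placeholder; the oop relocation recorded
       // here is expected to patch in the real narrow oop when the code is installed.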
3484   zero_extend(dst, dst, 32);
3485 }
3486 
3487 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3488   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3489   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
3490   int index = oop_recorder()->find_index(k);
3491   assert(!Universe::heap()->is_in(k), "should not be an oop");
3492 
3493   narrowKlass nk = CompressedKlassPointers::encode(k);
3494   relocate(metadata_Relocation::spec(index), [&] {
3495     li32(dst, nk);
3496   });
3497   zero_extend(dst, dst, 32);
3498 }
3499 
3500 // Maybe emit a call via a trampoline. If the code cache is small,
3501 // trampolines won't be emitted.
3502 address MacroAssembler::trampoline_call(Address entry) {
3503   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
3504          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
3505          entry.rspec().type() == relocInfo::static_call_type ||
3506          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
3507 
3508   address target = entry.target();
3509 
3510   // We need a trampoline if branches are far.
3511   if (!in_scratch_emit_size()) {
3512     if (entry.rspec().type() == relocInfo::runtime_call_type) {
3513       assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
3514       code()->share_trampoline_for(entry.target(), offset());
3515     } else {
3516       address stub = emit_trampoline_stub(offset(), target);
3517       if (stub == nullptr) {
3518         postcond(pc() == badAddress);
3519         return nullptr; // CodeCache is full
3520       }
3521     }
3522   }
3523   target = pc();
3524 
3525   address call_pc = pc();
3526 #ifdef ASSERT
3527   if (entry.rspec().type() != relocInfo::runtime_call_type) {
3528     assert_alignment(call_pc);
3529   }
3530 #endif
3531   relocate(entry.rspec(), [&] {
3532     jal(target);
3533   });
3534 
3535   postcond(pc() != badAddress);
3536   return call_pc;
3537 }
3538 
3539 address MacroAssembler::ic_call(address entry, jint method_index) {
3540   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
3541   IncompressibleRegion ir(this);  // relocations
3542   movptr(t1, (address)Universe::non_oop_word());
3543   assert_cond(entry != nullptr);
3544   return trampoline_call(Address(entry, rh));
3545 }
3546 
3547 int MacroAssembler::ic_check_size() {
3548   // No compressed instructions (ic_check runs under an IncompressibleRegion)
3549   return (NativeInstruction::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
3550           far_branch_size();
3551 }
3552 
3553 int MacroAssembler::ic_check(int end_alignment) {
3554   IncompressibleRegion ir(this);
3555   Register receiver = j_rarg0;
3556   Register data = t1;
3557 
3558   Register tmp1 = t0; // t0 always scratch
3559   // t2 is saved on call, thus should have been saved before this check.
3560   // Hence we can clobber it.
3561   Register tmp2 = t2;
3562 
3563   // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
3564   // before the inline cache check, so we don't have to execute any nop instructions when dispatching
3565   // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
3566   // before the inline cache check here, and not after it.
3567   align(end_alignment, ic_check_size());
3568   int uep_offset = offset();
3569 
3570   if (UseCompressedClassPointers) {
3571     lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
3572     lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
3573   } else {
3574     ld(tmp1,  Address(receiver, oopDesc::klass_offset_in_bytes()));
3575     ld(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
3576   }
3577 
3578   Label ic_hit;
3579   beq(tmp1, tmp2, ic_hit);
3580   // Note: far_jump is not a fixed-size sequence.
3581   // If this ever generates a movptr, alignment/size will be off.
3582   far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
3583   bind(ic_hit);
3584 
3585   assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
3586   return uep_offset;
3587 }
3588 
3589 // Emit a trampoline stub for a call to a target which is too far away.
3590 //
3591 // code sequences:
3592 //
3593 // call-site:
3594 //   branch-and-link to <destination> or <trampoline stub>
3595 //
3596 // Related trampoline stub for this call site in the stub section:
3597 //   load the call target from the constant pool
3598 //   branch (RA still points to the call site above)
3599 
3600 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
3601                                              address dest) {
3602   // Max stub size: alignment nop, TrampolineStub.
3603   address stub = start_a_stub(max_trampoline_stub_size());
3604   if (stub == nullptr) {
3605     return nullptr;  // CodeBuffer::expand failed
3606   }
3607 
3608   // We are always 4-byte aligned here.
3609   assert_alignment(pc());
3610 
3611   // Create a trampoline stub relocation which relates this trampoline stub
3612   // with the call instruction at insts_call_instruction_offset in the
3613   // instructions code-section.
3614 
3615   // Make sure the address of the destination is 8-byte aligned after 3 instructions.
3616   align(wordSize, NativeCallTrampolineStub::data_offset);
3617 
3618   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
3619                                                          insts_call_instruction_offset);
3620   const int stub_start_offset = offset();
3621   relocate(rh, [&] {
3622     // Now, create the trampoline stub's code:
3623     // - load the call
3624     // - call
3625     Label target;
3626     ld(t0, target);  // auipc + ld
3627     jr(t0);          // jalr
3628     bind(target);
3629     assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
3630            "should be");
3631     assert(offset() % wordSize == 0, "bad alignment");
3632     emit_int64((int64_t)dest);
3633   });
3634 
3635   const address stub_start_addr = addr_at(stub_start_offset);
3636 
3637   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
3638 
3639   end_a_stub();
3640   return stub_start_addr;
3641 }
3642 
3643 int MacroAssembler::max_trampoline_stub_size() {
3644   // Max stub size: alignment nop, TrampolineStub.
3645   return NativeInstruction::instruction_size + NativeCallTrampolineStub::instruction_size;
3646 }
3647 
3648 int MacroAssembler::static_call_stub_size() {
3649   // (lui, addi, slli, addi, slli, addi) + (lui, addi, slli, addi, slli) + jalr
3650   return 12 * NativeInstruction::instruction_size;
3651 }
3652 
3653 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
3654   switch (dst.getMode()) {
3655     case Address::base_plus_offset:
3656       // This is the expected mode, although we allow all the other
3657       // forms below.
3658       return form_address(tmp, dst.base(), dst.offset());
3659     default:
3660       la(tmp, dst);
3661       return Address(tmp);
3662   }
3663 }
3664 
3665 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3666   assert(((dst.getMode() == Address::base_plus_offset &&
3667            is_simm12(dst.offset())) || is_simm12(value)),
3668           "invalid value and address mode combination");
3669   Address adr = add_memory_helper(dst, tmp2);
3670   assert(!adr.uses(tmp1), "invalid dst for address increment");
3671   ld(tmp1, adr);
3672   add(tmp1, tmp1, value, tmp2);
3673   sd(tmp1, adr);
3674 }
3675 
3676 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3677   assert(((dst.getMode() == Address::base_plus_offset &&
3678            is_simm12(dst.offset())) || is_simm12(value)),
3679           "invalid value and address mode combination");
3680   Address adr = add_memory_helper(dst, tmp2);
3681   assert(!adr.uses(tmp1), "invalid dst for address increment");
3682   lwu(tmp1, adr);
3683   addw(tmp1, tmp1, value, tmp2);
3684   sw(tmp1, adr);
3685 }
3686 
3687 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3688   assert(((dst.getMode() == Address::base_plus_offset &&
3689            is_simm12(dst.offset())) || is_simm12(value)),
3690           "invalid value and address mode combination");
3691   Address adr = add_memory_helper(dst, tmp2);
3692   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3693   ld(tmp1, adr);
3694   sub(tmp1, tmp1, value, tmp2);
3695   sd(tmp1, adr);
3696 }
3697 
3698 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3699   assert(((dst.getMode() == Address::base_plus_offset &&
3700            is_simm12(dst.offset())) || is_simm12(value)),
3701           "invalid value and address mode combination");
3702   Address adr = add_memory_helper(dst, tmp2);
3703   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3704   lwu(tmp1, adr);
3705   subw(tmp1, tmp1, value, tmp2);
3706   sw(tmp1, adr);
3707 }
3708 
3709 void MacroAssembler::cmpptr(Register src1, Address src2, Label& equal) {
3710   assert_different_registers(src1, t0);
3711   relocate(src2.rspec(), [&] {
3712     int32_t offset;
3713     la(t0, src2.target(), offset);
3714     ld(t0, Address(t0, offset));
3715   });
3716   beq(src1, t0, equal);
3717 }
3718 
3719 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
3720   load_method_holder(result, method);
3721   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
3722 }
3723 
3724 void MacroAssembler::load_method_holder(Register holder, Register method) {
3725   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3726   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3727   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
3728 }
3729 
3730 // string indexof
3731 // compute index by trailing zeros
3732 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
3733                                    Register match_mask, Register result,
3734                                    Register ch2, Register tmp,
3735                                    bool haystack_isL) {
3736   int haystack_chr_shift = haystack_isL ? 0 : 1;
3737   srl(match_mask, match_mask, trailing_zeros);
3738   srli(match_mask, match_mask, 1);
3739   srli(tmp, trailing_zeros, LogBitsPerByte);
3740   if (!haystack_isL) andi(tmp, tmp, 0xE);
3741   add(haystack, haystack, tmp);
3742   ld(ch2, Address(haystack));
3743   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
3744   add(result, result, tmp);
3745 }
3746 
3747 // string indexof
3748 // Find the pattern element in src and compute the match mask;
3749 // only the first (lowest-order) occurrence of 0x80/0x8000 is the valid match index.
3750 // match mask patterns and corresponding indices would be like:
3751 // - 0x8080808080808080 (Latin1)
3752 // -   7 6 5 4 3 2 1 0  (match index)
3753 // - 0x8000800080008000 (UTF16)
3754 // -   3   2   1   0    (match index)
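     //
     // This is the classic SWAR zero-byte trick; roughly (illustrative, assuming the Latin1
     // case with mask1 = 0x0101010101010101 and mask2 = 0x7f7f7f7f7f7f7f7f):
     //   t          = src ^ pattern;              // matching bytes become 0x00
     //   match_mask = (t - mask1) & ~(t | mask2); // 0x80 marks each zero byte of t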
3755 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
3756                                         Register mask1, Register mask2) {
3757   xorr(src, pattern, src);
3758   sub(match_mask, src, mask1);
3759   orr(src, src, mask2);
3760   notr(src, src);
3761   andr(match_mask, match_mask, src);
3762 }
3763 
3764 #ifdef COMPILER2
3765 // Code for BigInteger::mulAdd intrinsic
3766 // out     = x10
3767 // in      = x11
3768 // offset  = x12  (already out.length-offset)
3769 // len     = x13
3770 // k       = x14
3771 // tmp     = x28
3772 //
3773 // pseudo code from java implementation:
3774 // long kLong = k & LONG_MASK;
3775 // carry = 0;
3776 // offset = out.length-offset - 1;
3777 // for (int j = len - 1; j >= 0; j--) {
3778 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3779 //     out[offset--] = (int)product;
3780 //     carry = product >>> 32;
3781 // }
3782 // return (int)carry;
3783 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3784                              Register len, Register k, Register tmp) {
3785   Label L_tail_loop, L_unroll, L_end;
3786   mv(tmp, out);
3787   mv(out, zr);
3788   blez(len, L_end);
3789   zero_extend(k, k, 32);
3790   slliw(t0, offset, LogBytesPerInt);
3791   add(offset, tmp, t0);
3792   slliw(t0, len, LogBytesPerInt);
3793   add(in, in, t0);
3794 
3795   const int unroll = 8;
3796   mv(tmp, unroll);
3797   blt(len, tmp, L_tail_loop);
3798   bind(L_unroll);
3799   for (int i = 0; i < unroll; i++) {
3800     sub(in, in, BytesPerInt);
3801     lwu(t0, Address(in, 0));
3802     mul(t1, t0, k);
3803     add(t0, t1, out);
3804     sub(offset, offset, BytesPerInt);
3805     lwu(t1, Address(offset, 0));
3806     add(t0, t0, t1);
3807     sw(t0, Address(offset, 0));
3808     srli(out, t0, 32);
3809   }
3810   subw(len, len, tmp);
3811   bge(len, tmp, L_unroll);
3812 
3813   bind(L_tail_loop);
3814   blez(len, L_end);
3815   sub(in, in, BytesPerInt);
3816   lwu(t0, Address(in, 0));
3817   mul(t1, t0, k);
3818   add(t0, t1, out);
3819   sub(offset, offset, BytesPerInt);
3820   lwu(t1, Address(offset, 0));
3821   add(t0, t0, t1);
3822   sw(t0, Address(offset, 0));
3823   srli(out, t0, 32);
3824   subw(len, len, 1);
3825   j(L_tail_loop);
3826 
3827   bind(L_end);
3828 }
3829 
3830 // Multiply and multiply-accumulate unsigned 64-bit registers.
3831 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
3832   assert_different_registers(prod_lo, prod_hi);
3833 
3834   mul(prod_lo, n, m);
3835   mulhu(prod_hi, n, m);
3836 }
3837 
3838 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
3839                                Register m, Register tmp1, Register tmp2) {
3840   assert_different_registers(sum_lo, sum_hi);
3841   assert_different_registers(sum_hi, tmp2);
3842 
3843   wide_mul(tmp1, tmp2, n, m);
3844   cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
3845   adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
3846 }
3847 
3848 // Add two unsigned inputs and output the carry.
3849 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
3850 {
3851   assert_different_registers(dst, carry);
3852   assert_different_registers(dst, src2);
3853   add(dst, src1, src2);
3854   sltu(carry, dst, src2);
3855 }
3856 
3857 // Add two inputs plus an incoming carry.
3858 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
3859   assert_different_registers(dst, carry);
3860   add(dst, src1, src2);
3861   add(dst, dst, carry);
3862 }
3863 
3864 // Add two unsigned inputs plus an incoming carry and output the carry.
3865 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
3866   assert_different_registers(dst, src2);
3867   adc(dst, src1, src2, carry);
3868   sltu(carry, dst, src2);
3869 }
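
     // Taken together these helpers form a simple carry chain; roughly (illustrative):
     //   cad : dst = src1 + src2;          carry = (dst < src2)   // unsigned overflow check
     //   adc : dst = src1 + src2 + carry
     //   cadc: dst = src1 + src2 + carry;  carry = (dst < src2)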
3870 
3871 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
3872                                      Register src1, Register src2, Register carry) {
3873   cad(dest_lo, dest_lo, src1, carry);
3874   add(dest_hi, dest_hi, carry);
3875   cad(dest_lo, dest_lo, src2, carry);
3876   add(final_dest_hi, dest_hi, carry);
3877 }
3878 
3879 /**
3880  * Multiply 32 bit by 32 bit first loop.
3881  */
3882 void MacroAssembler::multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
3883                                            Register y, Register y_idx, Register z,
3884                                            Register carry, Register product,
3885                                            Register idx, Register kdx) {
3886   // jlong carry, x[], y[], z[];
3887   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3888   //     long product = y[idx] * x[xstart] + carry;
3889   //     z[kdx] = (int)product;
3890   //     carry = product >>> 32;
3891   // }
3892   // z[xstart] = (int)carry;
3893 
3894   Label L_first_loop, L_first_loop_exit;
3895   blez(idx, L_first_loop_exit);
3896 
3897   shadd(t0, xstart, x, t0, LogBytesPerInt);
3898   lwu(x_xstart, Address(t0, 0));
3899 
3900   bind(L_first_loop);
3901   subw(idx, idx, 1);
3902   shadd(t0, idx, y, t0, LogBytesPerInt);
3903   lwu(y_idx, Address(t0, 0));
3904   mul(product, x_xstart, y_idx);
3905   add(product, product, carry);
3906   srli(carry, product, 32);
3907   subw(kdx, kdx, 1);
3908   shadd(t0, kdx, z, t0, LogBytesPerInt);
3909   sw(product, Address(t0, 0));
3910   bgtz(idx, L_first_loop);
3911 
3912   bind(L_first_loop_exit);
3913 }
3914 
3915 /**
3916  * Multiply 64 bit by 64 bit first loop.
3917  */
3918 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
3919                                            Register y, Register y_idx, Register z,
3920                                            Register carry, Register product,
3921                                            Register idx, Register kdx) {
3922   //
3923   //  jlong carry, x[], y[], z[];
3924   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3925   //    huge_128 product = y[idx] * x[xstart] + carry;
3926   //    z[kdx] = (jlong)product;
3927   //    carry  = (jlong)(product >>> 64);
3928   //  }
3929   //  z[xstart] = carry;
3930   //
3931 
3932   Label L_first_loop, L_first_loop_exit;
3933   Label L_one_x, L_one_y, L_multiply;
3934 
3935   subw(xstart, xstart, 1);
3936   bltz(xstart, L_one_x);
3937 
3938   shadd(t0, xstart, x, t0, LogBytesPerInt);
3939   ld(x_xstart, Address(t0, 0));
3940   ror_imm(x_xstart, x_xstart, 32); // convert big-endian to little-endian
3941 
3942   bind(L_first_loop);
3943   subw(idx, idx, 1);
3944   bltz(idx, L_first_loop_exit);
3945   subw(idx, idx, 1);
3946   bltz(idx, L_one_y);
3947 
3948   shadd(t0, idx, y, t0, LogBytesPerInt);
3949   ld(y_idx, Address(t0, 0));
3950   ror_imm(y_idx, y_idx, 32); // convert big-endian to little-endian
3951   bind(L_multiply);
3952 
3953   mulhu(t0, x_xstart, y_idx);
3954   mul(product, x_xstart, y_idx);
3955   cad(product, product, carry, t1);
3956   adc(carry, t0, zr, t1);
3957 
3958   subw(kdx, kdx, 2);
3959   ror_imm(product, product, 32); // back to big-endian
3960   shadd(t0, kdx, z, t0, LogBytesPerInt);
3961   sd(product, Address(t0, 0));
3962 
3963   j(L_first_loop);
3964 
3965   bind(L_one_y);
3966   lwu(y_idx, Address(y, 0));
3967   j(L_multiply);
3968 
3969   bind(L_one_x);
3970   lwu(x_xstart, Address(x, 0));
3971   j(L_first_loop);
3972 
3973   bind(L_first_loop_exit);
3974 }
3975 
3976 /**
3977  * Multiply 128 bit by 128 bit. Unrolled inner loop.
3978  *
3979  */
3980 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3981                                              Register carry, Register carry2,
3982                                              Register idx, Register jdx,
3983                                              Register yz_idx1, Register yz_idx2,
3984                                              Register tmp, Register tmp3, Register tmp4,
3985                                              Register tmp6, Register product_hi) {
3986   //   jlong carry, x[], y[], z[];
3987   //   int kdx = xstart+1;
3988   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3989   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3990   //     jlong carry2  = (jlong)(tmp3 >>> 64);
3991   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
3992   //     carry  = (jlong)(tmp4 >>> 64);
3993   //     z[kdx+idx+1] = (jlong)tmp3;
3994   //     z[kdx+idx] = (jlong)tmp4;
3995   //   }
3996   //   idx += 2;
3997   //   if (idx > 0) {
3998   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3999   //     z[kdx+idx] = (jlong)yz_idx1;
4000   //     carry  = (jlong)(yz_idx1 >>> 64);
4001   //   }
4002   //
4003 
4004   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4005 
4006   srliw(jdx, idx, 2);
4007 
4008   bind(L_third_loop);
4009 
4010   subw(jdx, jdx, 1);
4011   bltz(jdx, L_third_loop_exit);
4012   subw(idx, idx, 4);
4013 
4014   shadd(t0, idx, y, t0, LogBytesPerInt);
4015   ld(yz_idx2, Address(t0, 0));
4016   ld(yz_idx1, Address(t0, wordSize));
4017 
4018   shadd(tmp6, idx, z, t0, LogBytesPerInt);
4019 
4020   ror_imm(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
4021   ror_imm(yz_idx2, yz_idx2, 32);
4022 
4023   ld(t1, Address(tmp6, 0));
4024   ld(t0, Address(tmp6, wordSize));
4025 
4026   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
4027   mulhu(tmp4, product_hi, yz_idx1);
4028 
4029   ror_imm(t0, t0, 32, tmp); // convert big-endian to little-endian
4030   ror_imm(t1, t1, 32, tmp);
4031 
4032   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
4033   mulhu(carry2, product_hi, yz_idx2);
4034 
4035   cad(tmp3, tmp3, carry, carry);
4036   adc(tmp4, tmp4, zr, carry);
4037   cad(tmp3, tmp3, t0, t0);
4038   cadc(tmp4, tmp4, tmp, t0);
4039   adc(carry, carry2, zr, t0);
4040   cad(tmp4, tmp4, t1, carry2);
4041   adc(carry, carry, zr, carry2);
4042 
4043   ror_imm(tmp3, tmp3, 32); // convert little-endian to big-endian
4044   ror_imm(tmp4, tmp4, 32);
4045   sd(tmp4, Address(tmp6, 0));
4046   sd(tmp3, Address(tmp6, wordSize));
4047 
4048   j(L_third_loop);
4049 
4050   bind(L_third_loop_exit);
4051 
4052   andi(idx, idx, 0x3);
4053   beqz(idx, L_post_third_loop_done);
4054 
4055   Label L_check_1;
4056   subw(idx, idx, 2);
4057   bltz(idx, L_check_1);
4058 
4059   shadd(t0, idx, y, t0, LogBytesPerInt);
4060   ld(yz_idx1, Address(t0, 0));
4061   ror_imm(yz_idx1, yz_idx1, 32);
4062 
4063   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
4064   mulhu(tmp4, product_hi, yz_idx1);
4065 
4066   shadd(t0, idx, z, t0, LogBytesPerInt);
4067   ld(yz_idx2, Address(t0, 0));
4068   ror_imm(yz_idx2, yz_idx2, 32, tmp);
4069 
4070   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
4071 
4072   ror_imm(tmp3, tmp3, 32, tmp);
4073   sd(tmp3, Address(t0, 0));
4074 
4075   bind(L_check_1);
4076 
4077   andi(idx, idx, 0x1);
4078   subw(idx, idx, 1);
4079   bltz(idx, L_post_third_loop_done);
4080   shadd(t0, idx, y, t0, LogBytesPerInt);
4081   lwu(tmp4, Address(t0, 0));
4082   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
4083   mulhu(carry2, tmp4, product_hi);
4084 
4085   shadd(t0, idx, z, t0, LogBytesPerInt);
4086   lwu(tmp4, Address(t0, 0));
4087 
4088   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
4089 
4090   shadd(t0, idx, z, t0, LogBytesPerInt);
4091   sw(tmp3, Address(t0, 0));
4092 
4093   slli(t0, carry2, 32);
4094   srli(carry, tmp3, 32);
4095   orr(carry, carry, t0);
4096 
4097   bind(L_post_third_loop_done);
4098 }
4099 
4100 /**
4101  * Code for BigInteger::multiplyToLen() intrinsic.
4102  *
4103  * x10: x
4104  * x11: xlen
4105  * x12: y
4106  * x13: ylen
4107  * x14: z
4108  * x15: zlen
4109  * x16: tmp1
4110  * x17: tmp2
4111  * x7:  tmp3
4112  * x28: tmp4
4113  * x29: tmp5
4114  * x30: tmp6
4115  * x31: tmp7
4116  */
4117 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
4118                                      Register z, Register zlen,
4119                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
4120                                      Register tmp5, Register tmp6, Register product_hi) {
4121   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4122 
4123   const Register idx = tmp1;
4124   const Register kdx = tmp2;
4125   const Register xstart = tmp3;
4126 
4127   const Register y_idx = tmp4;
4128   const Register carry = tmp5;
4129   const Register product = xlen;
4130   const Register x_xstart = zlen; // reuse register
4131 
4132   mv(idx, ylen); // idx = ylen;
4133   mv(kdx, zlen); // kdx = xlen+ylen;
4134   mv(carry, zr); // carry = 0;
4135 
4136   Label L_multiply_64_x_64_loop, L_done;
4137 
4138   subw(xstart, xlen, 1);
4139   bltz(xstart, L_done);
4140 
4141   const Register jdx = tmp1;
4142 
4143   if (AvoidUnalignedAccesses) {
4144     // Check if x and y are both 8-byte aligned.
4145     orr(t0, xlen, ylen);
4146     test_bit(t0, t0, 0);
4147     beqz(t0, L_multiply_64_x_64_loop);
4148 
4149     multiply_32_x_32_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
4150     shadd(t0, xstart, z, t0, LogBytesPerInt);
4151     sw(carry, Address(t0, 0));
4152 
4153     Label L_second_loop_unaligned;
4154     bind(L_second_loop_unaligned);
4155     mv(carry, zr);
4156     mv(jdx, ylen);
4157     subw(xstart, xstart, 1);
4158     bltz(xstart, L_done);
4159     sub(sp, sp, 2 * wordSize);
4160     sd(z, Address(sp, 0));
4161     sd(zr, Address(sp, wordSize));
4162     shadd(t0, xstart, z, t0, LogBytesPerInt);
4163     addi(z, t0, 4);
4164     shadd(t0, xstart, x, t0, LogBytesPerInt);
4165     lwu(product, Address(t0, 0));
4166     Label L_third_loop, L_third_loop_exit;
4167 
4168     blez(jdx, L_third_loop_exit);
4169 
4170     bind(L_third_loop);
4171     subw(jdx, jdx, 1);
4172     shadd(t0, jdx, y, t0, LogBytesPerInt);
4173     lwu(t0, Address(t0, 0));
4174     mul(t1, t0, product);
4175     add(t0, t1, carry);
4176     shadd(tmp6, jdx, z, t1, LogBytesPerInt);
4177     lwu(t1, Address(tmp6, 0));
4178     add(t0, t0, t1);
4179     sw(t0, Address(tmp6, 0));
4180     srli(carry, t0, 32);
4181     bgtz(jdx, L_third_loop);
4182 
4183     bind(L_third_loop_exit);
4184     ld(z, Address(sp, 0));
4185     addi(sp, sp, 2 * wordSize);
4186     shadd(t0, xstart, z, t0, LogBytesPerInt);
4187     sw(carry, Address(t0, 0));
4188 
4189     j(L_second_loop_unaligned);
4190   }
4191 
4192   bind(L_multiply_64_x_64_loop);
4193   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
4194 
4195   Label L_second_loop_aligned;
4196   beqz(kdx, L_second_loop_aligned);
4197 
4198   Label L_carry;
4199   subw(kdx, kdx, 1);
4200   beqz(kdx, L_carry);
4201 
4202   shadd(t0, kdx, z, t0, LogBytesPerInt);
4203   sw(carry, Address(t0, 0));
4204   srli(carry, carry, 32);
4205   subw(kdx, kdx, 1);
4206 
4207   bind(L_carry);
4208   shadd(t0, kdx, z, t0, LogBytesPerInt);
4209   sw(carry, Address(t0, 0));
4210 
4211   // Second and third (nested) loops.
4212   //
4213   // for (int i = xstart-1; i >= 0; i--) { // Second loop
4214   //   carry = 0;
4215   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4216   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4217   //                    (z[k] & LONG_MASK) + carry;
4218   //     z[k] = (int)product;
4219   //     carry = product >>> 32;
4220   //   }
4221   //   z[i] = (int)carry;
4222   // }
4223   //
4224   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
4225 
4226   bind(L_second_loop_aligned);
4227   mv(carry, zr); // carry = 0;
4228   mv(jdx, ylen); // j = ystart+1
4229 
4230   subw(xstart, xstart, 1); // i = xstart-1;
4231   bltz(xstart, L_done);
4232 
4233   sub(sp, sp, 4 * wordSize);
4234   sd(z, Address(sp, 0));
4235 
4236   Label L_last_x;
4237   shadd(t0, xstart, z, t0, LogBytesPerInt);
4238   addi(z, t0, 4);
4239   subw(xstart, xstart, 1); // i = xstart-1;
4240   bltz(xstart, L_last_x);
4241 
4242   shadd(t0, xstart, x, t0, LogBytesPerInt);
4243   ld(product_hi, Address(t0, 0));
4244   ror_imm(product_hi, product_hi, 32); // convert big-endian to little-endian
4245 
4246   Label L_third_loop_prologue;
4247   bind(L_third_loop_prologue);
4248 
4249   sd(ylen, Address(sp, wordSize));
4250   sd(x, Address(sp, 2 * wordSize));
4251   sd(xstart, Address(sp, 3 * wordSize));
4252   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
4253                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
4254   ld(z, Address(sp, 0));
4255   ld(ylen, Address(sp, wordSize));
4256   ld(x, Address(sp, 2 * wordSize));
4257   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
4258   addi(sp, sp, 4 * wordSize);
4259 
4260   addiw(tmp3, xlen, 1);
4261   shadd(t0, tmp3, z, t0, LogBytesPerInt);
4262   sw(carry, Address(t0, 0));
4263 
4264   subw(tmp3, tmp3, 1);
4265   bltz(tmp3, L_done);
4266 
4267   srli(carry, carry, 32);
4268   shadd(t0, tmp3, z, t0, LogBytesPerInt);
4269   sw(carry, Address(t0, 0));
4270   j(L_second_loop_aligned);
4271 
4272   // Next infrequent code is moved outside loops.
4273   bind(L_last_x);
4274   lwu(product_hi, Address(x, 0));
4275   j(L_third_loop_prologue);
4276 
4277   bind(L_done);
4278 }
4279 #endif
4280 
// Count the number of bits occupied by trailing zero chars (elements), scanning
// from lsb to msb until the first non-zero element. For the LL case each element
// is one byte, so we step 8 bits at a time; for the other cases each element is
// two bytes, so we step 16 bits at a time.
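// For example (illustrative): with isLL and Rs = 0x0000000000120000, the two
// trailing zero bytes give Rd = 16 on both the Zbb and the fallback paths.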
4284 void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1, Register tmp2) {
4285   if (UseZbb) {
4286     assert_different_registers(Rd, Rs, tmp1);
4287     int step = isLL ? 8 : 16;
4288     ctz(Rd, Rs);
4289     andi(tmp1, Rd, step - 1);
4290     sub(Rd, Rd, tmp1);
4291     return;
4292   }
4293 
4294   assert_different_registers(Rd, Rs, tmp1, tmp2);
4295   Label Loop;
4296   int step = isLL ? 8 : 16;
4297   mv(Rd, -step);
4298   mv(tmp2, Rs);
4299 
4300   bind(Loop);
4301   addi(Rd, Rd, step);
4302   andi(tmp1, tmp2, ((1 << step) - 1));
4303   srli(tmp2, tmp2, step);
4304   beqz(tmp1, Loop);
4305 }
4306 
// Reads the four adjacent bytes in the lower half of the source register and
// inflates (zero-extends) each byte into a 16-bit element of the destination
// register, for example:
4309 // Rs: A7A6A5A4A3A2A1A0
4310 // Rd: 00A300A200A100A0
4311 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
4312   assert_different_registers(Rd, Rs, tmp1, tmp2);
4313 
4314   mv(tmp1, 0xFF000000); // first byte mask at lower word
4315   andr(Rd, Rs, tmp1);
4316   for (int i = 0; i < 2; i++) {
4317     slli(Rd, Rd, wordSize);
4318     srli(tmp1, tmp1, wordSize);
4319     andr(tmp2, Rs, tmp1);
4320     orr(Rd, Rd, tmp2);
4321   }
4322   slli(Rd, Rd, wordSize);
4323   andi(tmp2, Rs, 0xFF); // last byte mask at lower word
4324   orr(Rd, Rd, tmp2);
4325 }
4326 
// Reads the four adjacent bytes in the upper half of the source register and
// inflates (zero-extends) each byte into a 16-bit element of the destination
// register, for example:
4329 // Rs: A7A6A5A4A3A2A1A0
4330 // Rd: 00A700A600A500A4
4331 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
4332   assert_different_registers(Rd, Rs, tmp1, tmp2);
4333   srli(Rs, Rs, 32);   // only upper 32 bits are needed
4334   inflate_lo32(Rd, Rs, tmp1, tmp2);
4335 }
4336 
4337 // The size of the blocks erased by the zero_blocks stub.  We must
4338 // handle anything smaller than this ourselves in zero_words().
4339 const int MacroAssembler::zero_words_block_size = 8;
4340 
4341 // zero_words() is used by C2 ClearArray patterns.  It is as small as
4342 // possible, handling small word counts locally and delegating
4343 // anything larger to the zero_blocks stub.  It is expanded many times
4344 // in compiled code, so it is important to keep it short.
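// For example (illustrative): with cnt = 5 (binary 101) the stub call is skipped;
// the bit-2 test below then stores four words and the bit-0 test stores the final
// word.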
4345 
4346 // ptr:   Address of a buffer to be zeroed.
4347 // cnt:   Count in HeapWords.
4348 //
4349 // ptr, cnt, and t0 are clobbered.
4350 address MacroAssembler::zero_words(Register ptr, Register cnt) {
4351   assert(is_power_of_2(zero_words_block_size), "adjust this");
4352   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
4353   assert_different_registers(cnt, t0);
4354 
4355   BLOCK_COMMENT("zero_words {");
4356 
4357   mv(t0, zero_words_block_size);
4358   Label around, done, done16;
4359   bltu(cnt, t0, around);
4360   {
4361     RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
4362     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
4363     if (StubRoutines::riscv::complete()) {
4364       address tpc = trampoline_call(zero_blocks);
4365       if (tpc == nullptr) {
4366         DEBUG_ONLY(reset_labels(around));
4367         postcond(pc() == badAddress);
4368         return nullptr;
4369       }
4370     } else {
4371       jal(zero_blocks);
4372     }
4373   }
4374   bind(around);
4375   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
4376     Label l;
4377     test_bit(t0, cnt, exact_log2(i));
4378     beqz(t0, l);
4379     for (int j = 0; j < i; j++) {
4380       sd(zr, Address(ptr, j * wordSize));
4381     }
4382     addi(ptr, ptr, i * wordSize);
4383     bind(l);
4384   }
4385   {
4386     Label l;
4387     test_bit(t0, cnt, 0);
4388     beqz(t0, l);
4389     sd(zr, Address(ptr, 0));
4390     bind(l);
4391   }
4392 
4393   BLOCK_COMMENT("} zero_words");
4394   postcond(pc() != badAddress);
4395   return pc();
4396 }
4397 
4398 #define SmallArraySize (18 * BytesPerLong)
4399 
4400 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
4401 // cnt:   Immediate count in HeapWords.
4402 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
4403   assert_different_registers(base, t0, t1);
4404 
4405   BLOCK_COMMENT("zero_words {");
4406 
4407   if (cnt <= SmallArraySize / BytesPerLong) {
4408     for (int i = 0; i < (int)cnt; i++) {
4409       sd(zr, Address(base, i * wordSize));
4410     }
4411   } else {
    const int unroll = 8; // Number of sd(zr, adr) instructions we'll unroll
4413     int remainder = cnt % unroll;
4414     for (int i = 0; i < remainder; i++) {
4415       sd(zr, Address(base, i * wordSize));
4416     }
4417 
4418     Label loop;
4419     Register cnt_reg = t0;
4420     Register loop_base = t1;
4421     cnt = cnt - remainder;
4422     mv(cnt_reg, cnt);
4423     add(loop_base, base, remainder * wordSize);
4424     bind(loop);
4425     sub(cnt_reg, cnt_reg, unroll);
4426     for (int i = 0; i < unroll; i++) {
4427       sd(zr, Address(loop_base, i * wordSize));
4428     }
4429     add(loop_base, loop_base, unroll * wordSize);
4430     bnez(cnt_reg, loop);
4431   }
4432 
4433   BLOCK_COMMENT("} zero_words");
4434 }
4435 
4436 // base:   Address of a buffer to be filled, 8 bytes aligned.
4437 // cnt:    Count in 8-byte unit.
4438 // value:  Value to be filled with.
4439 // base will point to the end of the buffer after filling.
4440 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
4441 //  Algorithm:
4442 //
4443 //    t0 = cnt & 7
4444 //    cnt -= t0
4445 //    p += t0
4446 //    switch (t0):
4447 //      switch start:
4448 //      do while cnt
4449 //        cnt -= 8
4450 //          p[-8] = value
4451 //        case 7:
4452 //          p[-7] = value
4453 //        case 6:
4454 //          p[-6] = value
4455 //          // ...
4456 //        case 1:
4457 //          p[-1] = value
4458 //        case 0:
4459 //          p += 8
4460 //      do-while end
4461 //    switch end
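//
//  For example (illustrative): with cnt = 11, t0 = 3, so base advances by 3 words
//  and the computed jump executes only the last three sd instructions of the
//  unrolled body; the loop then stores the remaining 8 words in a single pass.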
4462 
4463   assert_different_registers(base, cnt, value, t0, t1);
4464 
4465   Label fini, skip, entry, loop;
4466   const int unroll = 8; // Number of sd instructions we'll unroll
4467 
4468   beqz(cnt, fini);
4469 
4470   andi(t0, cnt, unroll - 1);
4471   sub(cnt, cnt, t0);
  // Round cnt down to a multiple of 8: the computed jump below first stores the
  // (cnt % 8) leading words, then the loop stores 8 words per iteration.
4473   shadd(base, t0, base, t1, 3);
4474   la(t1, entry);
  slli(t0, t0, 2); // t0 = (cnt % 8) * 4; each sd instruction is 4 bytes, so jumping to entry - t0 skips the stores we don't need
4476   sub(t1, t1, t0);
4477   jr(t1);
4478 
4479   bind(loop);
4480   add(base, base, unroll * 8);
4481   for (int i = -unroll; i < 0; i++) {
4482     sd(value, Address(base, i * 8));
4483   }
4484   bind(entry);
4485   sub(cnt, cnt, unroll);
4486   bgez(cnt, loop);
4487 
4488   bind(fini);
4489 }
4490 
4491 // Zero blocks of memory by using CBO.ZERO.
4492 //
4493 // Aligns the base address first sufficiently for CBO.ZERO, then uses
4494 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
4495 // zeroed in HeapWords.  Returns the count of words left to be zeroed
4496 // in cnt.
4497 //
4498 // NOTE: This is intended to be used in the zero_blocks() stub.  If
4499 // you want to use it elsewhere, note that cnt must be >= CacheLineSize.
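//
// For example (illustrative, assuming CacheLineSize == 64): if base is 16 bytes
// past a cache-line boundary, tmp1 = 48, so the computed jump below stores the six
// words needed to reach alignment, cnt drops by 6, and the loop then issues one
// cbo.zero per remaining full cache line.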
4500 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
4501   Label initial_table_end, loop;
4502 
4503   // Align base with cache line size.
4504   neg(tmp1, base);
4505   andi(tmp1, tmp1, CacheLineSize - 1);
4506 
4507   // tmp1: the number of bytes to be filled to align the base with cache line size.
4508   add(base, base, tmp1);
4509   srai(tmp2, tmp1, 3);
4510   sub(cnt, cnt, tmp2);
4511   srli(tmp2, tmp1, 1);
4512   la(tmp1, initial_table_end);
4513   sub(tmp2, tmp1, tmp2);
4514   jr(tmp2);
4515   for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
4516     sd(zr, Address(base, i));
4517   }
4518   bind(initial_table_end);
4519 
4520   mv(tmp1, CacheLineSize / wordSize);
4521   bind(loop);
4522   cbo_zero(base);
4523   sub(cnt, cnt, tmp1);
4524   add(base, base, CacheLineSize);
4525   bge(cnt, tmp1, loop);
4526 }
4527 
4528 // java.lang.Math.round(float a)
4529 // Returns the closest int to the argument, with ties rounding to positive infinity.
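// For example: round(2.5f) == 3, round(-2.5f) == -2, and round(Float.NaN) == 0.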
4530 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
  // This instruction sequence provides a performance improvement on all tested
  // devices; don't change it without re-verification.
4533   Label done;
4534   mv(t0, jint_cast(0.5f));
4535   fmv_w_x(ftmp, t0);
4536 
4537   // dst = 0 if NaN
  feq_s(t0, src, src); // replacing fclass with feq as a performance optimization
4539   mv(dst, zr);
4540   beqz(t0, done);
4541 
4542   // dst = (src + 0.5f) rounded down towards negative infinity
4543   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
4544   //   RDN is required for fadd_s, RNE gives incorrect results:
4545   //     --------------------------------------------------------------------
4546   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
4547   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
4548   //     --------------------------------------------------------------------
4549   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
4550   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
4551   //     --------------------------------------------------------------------
4552   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
4553   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
4554 
4555   bind(done);
4556 }
4557 
4558 // java.lang.Math.round(double a)
4559 // Returns the closest long to the argument, with ties rounding to positive infinity.
4560 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
  // This instruction sequence provides a performance improvement on all tested
  // devices; don't change it without re-verification.
4563   Label done;
4564   mv(t0, julong_cast(0.5));
4565   fmv_d_x(ftmp, t0);
4566 
4567   // dst = 0 if NaN
  feq_d(t0, src, src); // replacing fclass with feq as a performance optimization
4569   mv(dst, zr);
4570   beqz(t0, done);
4571 
4572   // dst = (src + 0.5) rounded down towards negative infinity
4573   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
4574   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
4575 
4576   bind(done);
4577 }
4578 
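// Float/double -> integer conversion with Java semantics for NaN: fcvt would
// convert a NaN input to the maximum integer value, so NaN is special-cased to
// zero here; all other inputs go through the plain fcvt instruction.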
4579 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
4580 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
4581   Label done;                                                                             \
4582   assert_different_registers(dst, tmp);                                                   \
4583   fclass_##FLOATSIG(tmp, src);                                                            \
4584   mv(dst, zr);                                                                            \
4585   /* check if src is NaN */                                                               \
4586   andi(tmp, tmp, fclass_mask::nan);                                                       \
4587   bnez(tmp, done);                                                                        \
4588   FLOATCVT(dst, src);                                                                     \
4589   bind(done);                                                                             \
4590 }
4591 
4592 FCVT_SAFE(fcvt_w_s, s);
4593 FCVT_SAFE(fcvt_l_s, s);
4594 FCVT_SAFE(fcvt_w_d, d);
4595 FCVT_SAFE(fcvt_l_d, d);
4596 
4597 #undef FCVT_SAFE
4598 
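// Three-way float/double compare producing -1, 0 or 1 in 'result', matching the
// Java fcmpl/fcmpg (dcmpl/dcmpg) bytecodes; 'unordered_result' selects the value
// produced when either input is NaN.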
4599 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
4600 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
4601                                          FloatRegister Rs2, int unordered_result) {     \
4602   Label Ldone;                                                                          \
4603   if (unordered_result < 0) {                                                           \
4604     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
4605     /* installs 1 if gt else 0 */                                                       \
4606     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
4607     /* Rs1 > Rs2, install 1 */                                                          \
4608     bgtz(result, Ldone);                                                                \
4609     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4610     addi(result, result, -1);                                                           \
4611     /* Rs1 = Rs2, install 0 */                                                          \
4612     /* NaN or Rs1 < Rs2, install -1 */                                                  \
4613     bind(Ldone);                                                                        \
4614   } else {                                                                              \
4615     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
4616     /* installs 1 if gt or unordered else 0 */                                          \
4617     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
4618     /* Rs1 < Rs2, install -1 */                                                         \
4619     bgtz(result, Ldone);                                                                \
4620     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4621     addi(result, result, -1);                                                           \
4622     /* Rs1 = Rs2, install 0 */                                                          \
4623     /* NaN or Rs1 > Rs2, install 1 */                                                   \
4624     bind(Ldone);                                                                        \
4625     neg(result, result);                                                                \
4626   }                                                                                     \
4627 }
4628 
4629 FCMP(float, s);
4630 FCMP(double, d);
4631 
4632 #undef FCMP
4633 
4634 // Zero words; len is in bytes
4635 // Destroys all registers except addr
4636 // len must be a nonzero multiple of wordSize
4637 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
4638   assert_different_registers(addr, len, tmp, t0, t1);
4639 
4640 #ifdef ASSERT
4641   {
4642     Label L;
4643     andi(t0, len, BytesPerWord - 1);
4644     beqz(t0, L);
4645     stop("len is not a multiple of BytesPerWord");
4646     bind(L);
4647   }
4648 #endif // ASSERT
4649 
4650 #ifndef PRODUCT
4651   block_comment("zero memory");
4652 #endif // PRODUCT
4653 
4654   Label loop;
4655   Label entry;
4656 
4657   // Algorithm:
4658   //
4659   //  t0 = cnt & 7
4660   //  cnt -= t0
4661   //  p += t0
4662   //  switch (t0) {
4663   //    do {
4664   //      cnt -= 8
4665   //        p[-8] = 0
4666   //      case 7:
4667   //        p[-7] = 0
4668   //      case 6:
4669   //        p[-6] = 0
4670   //        ...
4671   //      case 1:
4672   //        p[-1] = 0
4673   //      case 0:
4674   //        p += 8
4675   //     } while (cnt)
4676   //  }
4677 
4678   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
4679 
4680   srli(len, len, LogBytesPerWord);
4681   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
4682   sub(len, len, t0);          // cnt -= unroll
4683   // tmp always points to the end of the region we're about to zero
4684   shadd(tmp, t0, addr, t1, LogBytesPerWord);
4685   la(t1, entry);
4686   slli(t0, t0, 2);
4687   sub(t1, t1, t0);
4688   jr(t1);
4689   bind(loop);
4690   sub(len, len, unroll);
4691   for (int i = -unroll; i < 0; i++) {
4692     sd(zr, Address(tmp, i * wordSize));
4693   }
4694   bind(entry);
4695   add(tmp, tmp, unroll * wordSize);
4696   bnez(len, loop);
4697 }
4698 
4699 // shift left by shamt and add
4700 // Rd = (Rs1 << shamt) + Rs2
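// Used throughout this file to form scaled element addresses; for example
// (illustrative), shadd(t0, idx, base, t0, LogBytesPerInt) computes base + idx * 4.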
4701 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
4702   if (UseZba) {
4703     if (shamt == 1) {
4704       sh1add(Rd, Rs1, Rs2);
4705       return;
4706     } else if (shamt == 2) {
4707       sh2add(Rd, Rs1, Rs2);
4708       return;
4709     } else if (shamt == 3) {
4710       sh3add(Rd, Rs1, Rs2);
4711       return;
4712     }
4713   }
4714 
4715   if (shamt != 0) {
4716     assert_different_registers(Rs2, tmp);
4717     slli(tmp, Rs1, shamt);
4718     add(Rd, Rs2, tmp);
4719   } else {
4720     add(Rd, Rs1, Rs2);
4721   }
4722 }
4723 
4724 void MacroAssembler::zero_extend(Register dst, Register src, int bits) {
4725   switch (bits) {
4726     case 32:
4727       if (UseZba) {
4728         zext_w(dst, src);
4729         return;
4730       }
4731       break;
4732     case 16:
4733       if (UseZbb) {
4734         zext_h(dst, src);
4735         return;
4736       }
4737       break;
4738     case 8:
4739       if (UseZbb) {
4740         zext_b(dst, src);
4741         return;
4742       }
4743       break;
4744     default:
4745       break;
4746   }
4747   slli(dst, src, XLEN - bits);
4748   srli(dst, dst, XLEN - bits);
4749 }
4750 
4751 void MacroAssembler::sign_extend(Register dst, Register src, int bits) {
4752   switch (bits) {
4753     case 32:
4754       sext_w(dst, src);
4755       return;
4756     case 16:
4757       if (UseZbb) {
4758         sext_h(dst, src);
4759         return;
4760       }
4761       break;
4762     case 8:
4763       if (UseZbb) {
4764         sext_b(dst, src);
4765         return;
4766       }
4767       break;
4768     default:
4769       break;
4770   }
4771   slli(dst, src, XLEN - bits);
4772   srai(dst, dst, XLEN - bits);
4773 }
4774 
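// Three-way integer compare: dst = -1, 0 or 1 for src1 <, ==, > src2 respectively,
// using a signed or unsigned comparison as requested.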
4775 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
4776                              Register tmp, bool is_signed) {
4777   if (src1 == src2) {
4778     mv(dst, zr);
4779     return;
4780   }
4781   Label done;
4782   Register left = src1;
4783   Register right = src2;
4784   if (dst == src1) {
4785     assert_different_registers(dst, src2, tmp);
4786     mv(tmp, src1);
4787     left = tmp;
4788   } else if (dst == src2) {
4789     assert_different_registers(dst, src1, tmp);
4790     mv(tmp, src2);
4791     right = tmp;
4792   }
4793 
4794   // installs 1 if gt else 0
4795   if (is_signed) {
4796     slt(dst, right, left);
4797   } else {
4798     sltu(dst, right, left);
4799   }
4800   bnez(dst, done);
4801   if (is_signed) {
4802     slt(dst, left, right);
4803   } else {
4804     sltu(dst, left, right);
4805   }
  // dst = -1 if lt; else if eq, dst = 0
4807   neg(dst, dst);
4808   bind(done);
4809 }
4810 
4811 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
4812 {
4813   cmp_x2i(dst, src1, src2, tmp);
4814 }
4815 
4816 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
4817   cmp_x2i(dst, src1, src2, tmp, false);
4818 }
4819 
4820 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
4821   cmp_x2i(dst, src1, src2, tmp, false);
4822 }
4823 
4824 // The java_calling_convention describes stack locations as ideal slots on
4825 // a frame with no abi restrictions. Since we must observe abi restrictions
4826 // (like the placement of the register window) the slots must be biased by
4827 // the following value.
4828 static int reg2offset_in(VMReg r) {
4829   // Account for saved fp and ra
4830   // This should really be in_preserve_stack_slots
4831   return r->reg2stack() * VMRegImpl::stack_slot_size;
4832 }
4833 
4834 static int reg2offset_out(VMReg r) {
4835   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
4836 }
4837 
// On 64-bit we store integer-like items to the stack as 64-bit items
// (riscv64 ABI), even though Java would only store 32 bits for a parameter.
// On 32-bit it would simply be 32 bits, so this routine does 32->32 on
// 32-bit and 32->64 on 64-bit.
4842 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
4843   if (src.first()->is_stack()) {
4844     if (dst.first()->is_stack()) {
4845       // stack to stack
4846       ld(tmp, Address(fp, reg2offset_in(src.first())));
4847       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4848     } else {
4849       // stack to reg
4850       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4851     }
4852   } else if (dst.first()->is_stack()) {
4853     // reg to stack
4854     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
4855   } else {
4856     if (dst.first() != src.first()) {
4857       sign_extend(dst.first()->as_Register(), src.first()->as_Register(), 32);
4858     }
4859   }
4860 }
4861 
4862 // An oop arg. Must pass a handle not the oop itself
4863 void MacroAssembler::object_move(OopMap* map,
4864                                  int oop_handle_offset,
4865                                  int framesize_in_slots,
4866                                  VMRegPair src,
4867                                  VMRegPair dst,
4868                                  bool is_receiver,
4869                                  int* receiver_offset) {
4870   assert_cond(map != nullptr && receiver_offset != nullptr);
4871 
4872   // must pass a handle. First figure out the location we use as a handle
4873   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
4874 
  // See if the oop is null; if it is, we need no handle
4876 
4877   if (src.first()->is_stack()) {
4878     // Oop is already on the stack as an argument
4879     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
4880     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
4881     if (is_receiver) {
4882       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
4883     }
4884 
4885     ld(t0, Address(fp, reg2offset_in(src.first())));
4886     la(rHandle, Address(fp, reg2offset_in(src.first())));
4887     // conditionally move a null
4888     Label notZero1;
4889     bnez(t0, notZero1);
4890     mv(rHandle, zr);
4891     bind(notZero1);
4892   } else {
4893 
    // The oop is in a register; we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if the oop is non-null
4896 
4897     const Register rOop = src.first()->as_Register();
4898     int oop_slot = -1;
4899     if (rOop == j_rarg0) {
4900       oop_slot = 0;
4901     } else if (rOop == j_rarg1) {
4902       oop_slot = 1;
4903     } else if (rOop == j_rarg2) {
4904       oop_slot = 2;
4905     } else if (rOop == j_rarg3) {
4906       oop_slot = 3;
4907     } else if (rOop == j_rarg4) {
4908       oop_slot = 4;
4909     } else if (rOop == j_rarg5) {
4910       oop_slot = 5;
4911     } else if (rOop == j_rarg6) {
4912       oop_slot = 6;
4913     } else {
4914       assert(rOop == j_rarg7, "wrong register");
4915       oop_slot = 7;
4916     }
4917 
4918     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
4919     int offset = oop_slot * VMRegImpl::stack_slot_size;
4920 
4921     map->set_oop(VMRegImpl::stack2reg(oop_slot));
4922     // Store oop in handle area, may be null
4923     sd(rOop, Address(sp, offset));
4924     if (is_receiver) {
4925       *receiver_offset = offset;
4926     }
4927 
    // rOop may be the same as rHandle
4929     if (rOop == rHandle) {
4930       Label isZero;
4931       beqz(rOop, isZero);
4932       la(rHandle, Address(sp, offset));
4933       bind(isZero);
4934     } else {
4935       Label notZero2;
4936       la(rHandle, Address(sp, offset));
4937       bnez(rOop, notZero2);
4938       mv(rHandle, zr);
4939       bind(notZero2);
4940     }
4941   }
4942 
  // If the arg is on the stack then place it; otherwise it is already in the correct reg.
4944   if (dst.first()->is_stack()) {
4945     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
4946   }
4947 }
4948 
// A float arg may have to do a float-reg to int-reg conversion
4950 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
4951   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
4952          (src.first()->is_reg() && dst.first()->is_reg()) ||
4953          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
4954   if (src.first()->is_stack()) {
4955     if (dst.first()->is_stack()) {
4956       lwu(tmp, Address(fp, reg2offset_in(src.first())));
4957       sw(tmp, Address(sp, reg2offset_out(dst.first())));
4958     } else if (dst.first()->is_Register()) {
4959       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4960     } else {
4961       ShouldNotReachHere();
4962     }
4963   } else if (src.first() != dst.first()) {
4964     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
4965       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
4966     } else {
4967       ShouldNotReachHere();
4968     }
4969   }
4970 }
4971 
4972 // A long move
4973 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
4974   if (src.first()->is_stack()) {
4975     if (dst.first()->is_stack()) {
4976       // stack to stack
4977       ld(tmp, Address(fp, reg2offset_in(src.first())));
4978       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4979     } else {
4980       // stack to reg
4981       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4982     }
4983   } else if (dst.first()->is_stack()) {
4984     // reg to stack
4985     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
4986   } else {
4987     if (dst.first() != src.first()) {
4988       mv(dst.first()->as_Register(), src.first()->as_Register());
4989     }
4990   }
4991 }
4992 
4993 // A double move
4994 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
4995   assert((src.first()->is_stack() && dst.first()->is_stack()) ||
4996          (src.first()->is_reg() && dst.first()->is_reg()) ||
4997          (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
4998   if (src.first()->is_stack()) {
4999     if (dst.first()->is_stack()) {
5000       ld(tmp, Address(fp, reg2offset_in(src.first())));
5001       sd(tmp, Address(sp, reg2offset_out(dst.first())));
    } else if (dst.first()->is_Register()) {
5003       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
5004     } else {
5005       ShouldNotReachHere();
5006     }
5007   } else if (src.first() != dst.first()) {
5008     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
5009       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
5010     } else {
5011       ShouldNotReachHere();
5012     }
5013   }
5014 }
5015 
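// Call a runtime entry point. If the destination is a CodeBlob in the code cache
// it is reached with far_call; otherwise the absolute address is materialized and
// called through 'tmp' with jalr.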
5016 void MacroAssembler::rt_call(address dest, Register tmp) {
5017   CodeBlob *cb = CodeCache::find_blob(dest);
5018   RuntimeAddress target(dest);
5019   if (cb) {
5020     far_call(target, tmp);
5021   } else {
5022     relocate(target.rspec(), [&] {
5023       int32_t offset;
5024       movptr(tmp, target.target(), offset);
5025       jalr(x1, tmp, offset);
5026     });
5027   }
5028 }
5029 
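// Test bit 'bit_pos' of Rs: Rd becomes non-zero iff that bit is set. Callers
// branch on the result with beqz/bnez rather than relying on an exact 0/1 value.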
5030 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
5031   assert(bit_pos < 64, "invalid bit range");
5032   if (UseZbs) {
5033     bexti(Rd, Rs, bit_pos);
5034     return;
5035   }
5036   int64_t imm = (int64_t)(1UL << bit_pos);
5037   if (is_simm12(imm)) {
5038     and_imm12(Rd, Rs, imm);
5039   } else {
5040     srli(Rd, Rs, bit_pos);
5041     and_imm12(Rd, Rd, 1);
5042   }
5043 }
5044 
5045 // Implements lightweight-locking.
5046 //
5047 //  - obj: the object to be locked
5048 //  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
5049 //  - slow: branched to if locking fails
5050 void MacroAssembler::lightweight_lock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
5051   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5052   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
5053 
5054   Label push;
5055   const Register top = tmp1;
5056   const Register mark = tmp2;
5057   const Register t = tmp3;
5058 
5059   // Preload the markWord. It is important that this is the first
5060   // instruction emitted as it is part of C1's null check semantics.
5061   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5062 
5063   // Check if the lock-stack is full.
5064   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5065   mv(t, (unsigned)LockStack::end_offset());
5066   bge(top, t, slow, /* is_far */ true);
5067 
5068   // Check for recursion.
5069   add(t, xthread, top);
5070   ld(t, Address(t, -oopSize));
5071   beq(obj, t, push);
5072 
5073   // Check header for monitor (0b10).
5074   test_bit(t, mark, exact_log2(markWord::monitor_value));
5075   bnez(t, slow, /* is_far */ true);
5076 
5077   // Try to lock. Transition lock-bits 0b01 => 0b00
5078   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
5079   ori(mark, mark, markWord::unlocked_value);
5080   xori(t, mark, markWord::unlocked_value);
5081   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
5082           /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
5083   bne(mark, t, slow, /* is_far */ true);
5084 
5085   bind(push);
5086   // After successful lock, push object on lock-stack.
5087   add(t, xthread, top);
5088   sd(obj, Address(t));
5089   addw(top, top, oopSize);
5090   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5091 }
5092 
// Implements lightweight-unlocking.
5094 //
5095 // - obj: the object to be unlocked
5096 // - tmp1, tmp2, tmp3: temporary registers
5097 // - slow: branched to if unlocking fails
5098 void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
5099   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5100   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
5101 
5102 #ifdef ASSERT
5103   {
5104     // Check for lock-stack underflow.
5105     Label stack_ok;
5106     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
5107     mv(tmp2, (unsigned)LockStack::start_offset());
5108     bge(tmp1, tmp2, stack_ok);
5109     STOP("Lock-stack underflow");
5110     bind(stack_ok);
5111   }
5112 #endif
5113 
5114   Label unlocked, push_and_slow;
5115   const Register top = tmp1;
5116   const Register mark = tmp2;
5117   const Register t = tmp3;
5118 
5119   // Check if obj is top of lock-stack.
5120   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5121   subw(top, top, oopSize);
5122   add(t, xthread, top);
5123   ld(t, Address(t));
5124   bne(obj, t, slow, /* is_far */ true);
5125 
5126   // Pop lock-stack.
5127   DEBUG_ONLY(add(t, xthread, top);)
5128   DEBUG_ONLY(sd(zr, Address(t));)
5129   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5130 
5131   // Check if recursive.
5132   add(t, xthread, top);
5133   ld(t, Address(t, -oopSize));
5134   beq(obj, t, unlocked);
5135 
5136   // Not recursive. Check header for monitor (0b10).
5137   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5138   test_bit(t, mark, exact_log2(markWord::monitor_value));
5139   bnez(t, push_and_slow);
5140 
5141 #ifdef ASSERT
5142   // Check header not unlocked (0b01).
5143   Label not_unlocked;
5144   test_bit(t, mark, exact_log2(markWord::unlocked_value));
5145   beqz(t, not_unlocked);
5146   stop("lightweight_unlock already unlocked");
5147   bind(not_unlocked);
5148 #endif
5149 
5150   // Try to unlock. Transition lock bits 0b00 => 0b01
5151   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
5152   ori(t, mark, markWord::unlocked_value);
5153   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
5154           /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
5155   beq(mark, t, unlocked);
5156 
5157   bind(push_and_slow);
5158   // Restore lock-stack and handle the unlock in runtime.
5159   DEBUG_ONLY(add(t, xthread, top);)
5160   DEBUG_ONLY(sd(obj, Address(t));)
5161   addw(top, top, oopSize);
5162   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
5163   j(slow);
5164 
5165   bind(unlocked);
5166 }