1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/assembler.hpp"
  29 #include "asm/assembler.inline.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/cardTableBarrierSet.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "interpreter/bytecodeHistogram.hpp"
  37 #include "interpreter/interpreter.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "nativeInst_riscv.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedOops.inline.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "oops/oop.hpp"
  45 #include "runtime/interfaceSupport.inline.hpp"
  46 #include "runtime/javaThread.hpp"
  47 #include "runtime/jniHandles.inline.hpp"
  48 #include "runtime/sharedRuntime.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "utilities/globalDefinitions.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/compile.hpp"
  54 #include "opto/node.hpp"
  55 #include "opto/output.hpp"
  56 #endif
  57 
  58 #ifdef PRODUCT
  59 #define BLOCK_COMMENT(str) /* nothing */
  60 #else
  61 #define BLOCK_COMMENT(str) block_comment(str)
  62 #endif
  63 #define STOP(str) stop(str);
  64 #define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
  65 
  66 static void pass_arg0(MacroAssembler* masm, Register arg) {
  67   if (c_rarg0 != arg) {
  68     masm->mv(c_rarg0, arg);
  69   }
  70 }
  71 
  72 static void pass_arg1(MacroAssembler* masm, Register arg) {
  73   if (c_rarg1 != arg) {
  74     masm->mv(c_rarg1, arg);
  75   }
  76 }
  77 
  78 static void pass_arg2(MacroAssembler* masm, Register arg) {
  79   if (c_rarg2 != arg) {
  80     masm->mv(c_rarg2, arg);
  81   }
  82 }
  83 
  84 static void pass_arg3(MacroAssembler* masm, Register arg) {
  85   if (c_rarg3 != arg) {
  86     masm->mv(c_rarg3, arg);
  87   }
  88 }
  89 
  90 void MacroAssembler::push_cont_fastpath(Register java_thread) {
  91   if (!Continuations::enabled()) return;
  92   Label done;
  93   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  94   bleu(sp, t0, done);
  95   sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
  96   bind(done);
  97 }
  98 
  99 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
 100   if (!Continuations::enabled()) return;
 101   Label done;
 102   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 103   bltu(sp, t0, done);
 104   sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 105   bind(done);
 106 }
 107 
 108 int MacroAssembler::align(int modulus, int extra_offset) {
 109   CompressibleRegion cr(this);
 110   intptr_t before = offset();
 111   while ((offset() + extra_offset) % modulus != 0) { nop(); }
 112   return (int)(offset() - before);
 113 }
 114 
 115 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 116   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 117 }
 118 
 119 // Implementation of call_VM versions
 120 
 121 void MacroAssembler::call_VM(Register oop_result,
 122                              address entry_point,
 123                              bool check_exceptions) {
 124   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 125 }
 126 
 127 void MacroAssembler::call_VM(Register oop_result,
 128                              address entry_point,
 129                              Register arg_1,
 130                              bool check_exceptions) {
 131   pass_arg1(this, arg_1);
 132   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 133 }
 134 
 135 void MacroAssembler::call_VM(Register oop_result,
 136                              address entry_point,
 137                              Register arg_1,
 138                              Register arg_2,
 139                              bool check_exceptions) {
 140   assert(arg_1 != c_rarg2, "smashed arg");
 141   pass_arg2(this, arg_2);
 142   pass_arg1(this, arg_1);
 143   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 144 }
 145 
 146 void MacroAssembler::call_VM(Register oop_result,
 147                              address entry_point,
 148                              Register arg_1,
 149                              Register arg_2,
 150                              Register arg_3,
 151                              bool check_exceptions) {
 152   assert(arg_1 != c_rarg3, "smashed arg");
 153   assert(arg_2 != c_rarg3, "smashed arg");
 154   pass_arg3(this, arg_3);
 155 
 156   assert(arg_1 != c_rarg2, "smashed arg");
 157   pass_arg2(this, arg_2);
 158 
 159   pass_arg1(this, arg_1);
 160   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 161 }
 162 
 163 void MacroAssembler::call_VM(Register oop_result,
 164                              Register last_java_sp,
 165                              address entry_point,
 166                              int number_of_arguments,
 167                              bool check_exceptions) {
 168   call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 169 }
 170 
 171 void MacroAssembler::call_VM(Register oop_result,
 172                              Register last_java_sp,
 173                              address entry_point,
 174                              Register arg_1,
 175                              bool check_exceptions) {
 176   pass_arg1(this, arg_1);
 177   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 178 }
 179 
 180 void MacroAssembler::call_VM(Register oop_result,
 181                              Register last_java_sp,
 182                              address entry_point,
 183                              Register arg_1,
 184                              Register arg_2,
 185                              bool check_exceptions) {
 186 
 187   assert(arg_1 != c_rarg2, "smashed arg");
 188   pass_arg2(this, arg_2);
 189   pass_arg1(this, arg_1);
 190   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 191 }
 192 
 193 void MacroAssembler::call_VM(Register oop_result,
 194                              Register last_java_sp,
 195                              address entry_point,
 196                              Register arg_1,
 197                              Register arg_2,
 198                              Register arg_3,
 199                              bool check_exceptions) {
 200   assert(arg_1 != c_rarg3, "smashed arg");
 201   assert(arg_2 != c_rarg3, "smashed arg");
 202   pass_arg3(this, arg_3);
 203   assert(arg_1 != c_rarg2, "smashed arg");
 204   pass_arg2(this, arg_2);
 205   pass_arg1(this, arg_1);
 206   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 207 }
 208 
 209 void MacroAssembler::post_call_nop() {
 210   if (!Continuations::enabled()) {
 211     return;
 212   }
 213   relocate(post_call_nop_Relocation::spec(), [&] {
 214     InlineSkippedInstructionsCounter skipCounter(this);
 215     nop();
 216     li32(zr, 0);
 217   });
 218 }
 219 
 220 // these are no-ops overridden by InterpreterMacroAssembler
 221 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
 222 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 223 
 224 // Calls to C land
 225 //
 226 // When entering C land, the fp, & esp of the last Java frame have to be recorded
 227 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 228 // has to be reset to 0. This is required to allow proper stack traversal.
 229 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 230                                          Register last_java_fp,
 231                                          Register last_java_pc,
 232                                          Register tmp) {
 233 
 234   if (last_java_pc->is_valid()) {
 235       sd(last_java_pc, Address(xthread,
 236                                JavaThread::frame_anchor_offset() +
 237                                JavaFrameAnchor::last_Java_pc_offset()));
 238   }
 239 
 240   // determine last_java_sp register
 241   if (last_java_sp == sp) {
 242     mv(tmp, sp);
 243     last_java_sp = tmp;
 244   } else if (!last_java_sp->is_valid()) {
 245     last_java_sp = esp;
 246   }
 247 
 248   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 249 
 250   // last_java_fp is optional
 251   if (last_java_fp->is_valid()) {
 252     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 253   }
 254 }
 255 
 256 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 257                                          Register last_java_fp,
 258                                          address  last_java_pc,
 259                                          Register tmp) {
 260   assert(last_java_pc != nullptr, "must provide a valid PC");
 261 
 262   la(tmp, last_java_pc);
 263   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 264 
 265   set_last_Java_frame(last_java_sp, last_java_fp, noreg, tmp);
 266 }
 267 
 268 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 269                                          Register last_java_fp,
 270                                          Label &L,
 271                                          Register tmp) {
 272   if (L.is_bound()) {
 273     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 274   } else {
 275     L.add_patch_at(code(), locator());
 276     IncompressibleRegion ir(this);  // the label address will be patched back.
 277     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 278   }
 279 }
 280 
 281 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 282   // we must set sp to zero to clear frame
 283   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 284 
 285   // must clear fp, so that compiled frames are not confused; it is
 286   // possible that we need it only for debugging
 287   if (clear_fp) {
 288     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 289   }
 290 
 291   // Always clear the pc because it could have been set by make_walkable()
 292   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 293 }
 294 
 295 void MacroAssembler::call_VM_base(Register oop_result,
 296                                   Register java_thread,
 297                                   Register last_java_sp,
 298                                   address  entry_point,
 299                                   int      number_of_arguments,
 300                                   bool     check_exceptions) {
 301    // determine java_thread register
 302   if (!java_thread->is_valid()) {
 303     java_thread = xthread;
 304   }
 305   // determine last_java_sp register
 306   if (!last_java_sp->is_valid()) {
 307     last_java_sp = esp;
 308   }
 309 
 310   // debugging support
 311   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 312   assert(java_thread == xthread, "unexpected register");
 313 
 314   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 315   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 316 
 317   // push java thread (becomes first argument of C function)
 318   mv(c_rarg0, java_thread);
 319 
 320   // set last Java frame before call
 321   assert(last_java_sp != fp, "can't use fp");
 322 
 323   Label l;
 324   set_last_Java_frame(last_java_sp, fp, l, t0);
 325 
 326   // do the call, remove parameters
 327   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 328 
 329   // reset last Java frame
 330   // Only interpreter should have to clear fp
 331   reset_last_Java_frame(true);
 332 
 333    // C++ interp handles this in the interpreter
 334   check_and_handle_popframe(java_thread);
 335   check_and_handle_earlyret(java_thread);
 336 
 337   if (check_exceptions) {
 338     // check for pending exceptions (java_thread is set upon return)
 339     ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 340     Label ok;
 341     beqz(t0, ok);
 342     RuntimeAddress target(StubRoutines::forward_exception_entry());
 343     relocate(target.rspec(), [&] {
 344       int32_t offset;
 345       la_patchable(t0, target, offset);
 346       jalr(x0, t0, offset);
 347     });
 348     bind(ok);
 349   }
 350 
 351   // get oop result if there is one and reset the value in the thread
 352   if (oop_result->is_valid()) {
 353     get_vm_result(oop_result, java_thread);
 354   }
 355 }
 356 
 357 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 358   ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 359   sd(zr, Address(java_thread, JavaThread::vm_result_offset()));
 360   verify_oop_msg(oop_result, "broken oop in call_VM_base");
 361 }
 362 
 363 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 364   ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 365   sd(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 366 }
 367 
 368 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 369   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
 370   assert_different_registers(klass, xthread, tmp);
 371 
 372   Label L_fallthrough, L_tmp;
 373   if (L_fast_path == nullptr) {
 374     L_fast_path = &L_fallthrough;
 375   } else if (L_slow_path == nullptr) {
 376     L_slow_path = &L_fallthrough;
 377   }
 378 
 379   // Fast path check: class is fully initialized
 380   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 381   sub(tmp, tmp, InstanceKlass::fully_initialized);
 382   beqz(tmp, *L_fast_path);
 383 
 384   // Fast path check: current thread is initializer thread
 385   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 386 
 387   if (L_slow_path == &L_fallthrough) {
 388     beq(xthread, tmp, *L_fast_path);
 389     bind(*L_slow_path);
 390   } else if (L_fast_path == &L_fallthrough) {
 391     bne(xthread, tmp, *L_slow_path);
 392     bind(*L_fast_path);
 393   } else {
 394     Unimplemented();
 395   }
 396 }
 397 
 398 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
 399   if (!VerifyOops) { return; }
 400 
 401   // Pass register number to verify_oop_subroutine
 402   const char* b = nullptr;
 403   {
 404     ResourceMark rm;
 405     stringStream ss;
 406     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
 407     b = code_string(ss.as_string());
 408   }
 409   BLOCK_COMMENT("verify_oop {");
 410 
 411   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 412 
 413   mv(c_rarg0, reg); // c_rarg0 : x10
 414   {
 415     // The length of the instruction sequence emitted should not depend
 416     // on the address of the char buffer so that the size of mach nodes for
 417     // scratch emit and normal emit matches.
 418     IncompressibleRegion ir(this);  // Fixed length
 419     movptr(t0, (address) b);
 420   }
 421 
 422   // call indirectly to solve generation ordering problem
 423   ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 424   relocate(target.rspec(), [&] {
 425     int32_t offset;
 426     la_patchable(t1, target, offset);
 427     ld(t1, Address(t1, offset));
 428   });
 429   jalr(t1);
 430 
 431   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 432 
 433   BLOCK_COMMENT("} verify_oop");
 434 }
 435 
 436 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
 437   if (!VerifyOops) {
 438     return;
 439   }
 440 
 441   const char* b = nullptr;
 442   {
 443     ResourceMark rm;
 444     stringStream ss;
 445     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
 446     b = code_string(ss.as_string());
 447   }
 448   BLOCK_COMMENT("verify_oop_addr {");
 449 
 450   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 451 
 452   if (addr.uses(sp)) {
 453     la(x10, addr);
 454     ld(x10, Address(x10, 4 * wordSize));
 455   } else {
 456     ld(x10, addr);
 457   }
 458 
 459   {
 460     // The length of the instruction sequence emitted should not depend
 461     // on the address of the char buffer so that the size of mach nodes for
 462     // scratch emit and normal emit matches.
 463     IncompressibleRegion ir(this);  // Fixed length
 464     movptr(t0, (address) b);
 465   }
 466 
 467   // call indirectly to solve generation ordering problem
 468   ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 469   relocate(target.rspec(), [&] {
 470     int32_t offset;
 471     la_patchable(t1, target, offset);
 472     ld(t1, Address(t1, offset));
 473   });
 474   jalr(t1);
 475 
 476   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 477 
 478   BLOCK_COMMENT("} verify_oop_addr");
 479 }
 480 
 481 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 482                                          int extra_slot_offset) {
 483   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 484   int stackElementSize = Interpreter::stackElementSize;
 485   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 486 #ifdef ASSERT
 487   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 488   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 489 #endif
 490   if (arg_slot.is_constant()) {
 491     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 492   } else {
 493     assert_different_registers(t0, arg_slot.as_register());
 494     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 495     return Address(t0, offset);
 496   }
 497 }
 498 
 499 #ifndef PRODUCT
 500 extern "C" void findpc(intptr_t x);
 501 #endif
 502 
 503 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 504 {
 505   // In order to get locks to work, we need to fake a in_VM state
 506   if (ShowMessageBoxOnError) {
 507     JavaThread* thread = JavaThread::current();
 508     JavaThreadState saved_state = thread->thread_state();
 509     thread->set_thread_state(_thread_in_vm);
 510 #ifndef PRODUCT
 511     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 512       ttyLocker ttyl;
 513       BytecodeCounter::print();
 514     }
 515 #endif
 516     if (os::message_box(msg, "Execution stopped, print registers?")) {
 517       ttyLocker ttyl;
 518       tty->print_cr(" pc = 0x%016lx", pc);
 519 #ifndef PRODUCT
 520       tty->cr();
 521       findpc(pc);
 522       tty->cr();
 523 #endif
 524       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 525       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 526       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 527       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 528       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 529       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 530       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 531       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 532       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 533       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 534       tty->print_cr("x10 = 0x%016lx", regs[10]);
 535       tty->print_cr("x11 = 0x%016lx", regs[11]);
 536       tty->print_cr("x12 = 0x%016lx", regs[12]);
 537       tty->print_cr("x13 = 0x%016lx", regs[13]);
 538       tty->print_cr("x14 = 0x%016lx", regs[14]);
 539       tty->print_cr("x15 = 0x%016lx", regs[15]);
 540       tty->print_cr("x16 = 0x%016lx", regs[16]);
 541       tty->print_cr("x17 = 0x%016lx", regs[17]);
 542       tty->print_cr("x18 = 0x%016lx", regs[18]);
 543       tty->print_cr("x19 = 0x%016lx", regs[19]);
 544       tty->print_cr("x20 = 0x%016lx", regs[20]);
 545       tty->print_cr("x21 = 0x%016lx", regs[21]);
 546       tty->print_cr("x22 = 0x%016lx", regs[22]);
 547       tty->print_cr("x23 = 0x%016lx", regs[23]);
 548       tty->print_cr("x24 = 0x%016lx", regs[24]);
 549       tty->print_cr("x25 = 0x%016lx", regs[25]);
 550       tty->print_cr("x26 = 0x%016lx", regs[26]);
 551       tty->print_cr("x27 = 0x%016lx", regs[27]);
 552       tty->print_cr("x28 = 0x%016lx", regs[28]);
 553       tty->print_cr("x30 = 0x%016lx", regs[30]);
 554       tty->print_cr("x31 = 0x%016lx", regs[31]);
 555       BREAKPOINT;
 556     }
 557   }
 558   fatal("DEBUG MESSAGE: %s", msg);
 559 }
 560 
 561 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
 562   assert_different_registers(value, tmp1, tmp2);
 563   Label done, tagged, weak_tagged;
 564 
 565   beqz(value, done);           // Use null as-is.
 566   // Test for tag.
 567   andi(tmp1, value, JNIHandles::tag_mask);
 568   bnez(tmp1, tagged);
 569 
 570   // Resolve local handle
 571   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
 572   verify_oop(value);
 573   j(done);
 574 
 575   bind(tagged);
 576   // Test for jweak tag.
 577   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
 578   test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
 579   bnez(tmp1, weak_tagged);
 580 
 581   // Resolve global handle
 582   access_load_at(T_OBJECT, IN_NATIVE, value,
 583                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 584   verify_oop(value);
 585   j(done);
 586 
 587   bind(weak_tagged);
 588   // Resolve jweak.
 589   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
 590                  Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
 591   verify_oop(value);
 592 
 593   bind(done);
 594 }
 595 
 596 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
 597   assert_different_registers(value, tmp1, tmp2);
 598   Label done;
 599 
 600   beqz(value, done);           // Use null as-is.
 601 
 602 #ifdef ASSERT
 603   {
 604     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
 605     Label valid_global_tag;
 606     test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
 607     bnez(tmp1, valid_global_tag);
 608     stop("non global jobject using resolve_global_jobject");
 609     bind(valid_global_tag);
 610   }
 611 #endif
 612 
 613   // Resolve global handle
 614   access_load_at(T_OBJECT, IN_NATIVE, value,
 615                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 616   verify_oop(value);
 617 
 618   bind(done);
 619 }
 620 
 621 void MacroAssembler::stop(const char* msg) {
 622   BLOCK_COMMENT(msg);
 623   illegal_instruction(Assembler::csr::time);
 624   emit_int64((uintptr_t)msg);
 625 }
 626 
 627 void MacroAssembler::unimplemented(const char* what) {
 628   const char* buf = nullptr;
 629   {
 630     ResourceMark rm;
 631     stringStream ss;
 632     ss.print("unimplemented: %s", what);
 633     buf = code_string(ss.as_string());
 634   }
 635   stop(buf);
 636 }
 637 
 638 void MacroAssembler::emit_static_call_stub() {
 639   IncompressibleRegion ir(this);  // Fixed length: see CompiledStaticCall::to_interp_stub_size().
 640   // CompiledDirectStaticCall::set_to_interpreted knows the
 641   // exact layout of this stub.
 642 
 643   mov_metadata(xmethod, (Metadata*)nullptr);
 644 
 645   // Jump to the entry point of the c2i stub.
 646   int32_t offset = 0;
 647   movptr(t0, 0, offset);
 648   jalr(x0, t0, offset);
 649 }
 650 
 651 void MacroAssembler::call_VM_leaf_base(address entry_point,
 652                                        int number_of_arguments,
 653                                        Label *retaddr) {
 654   push_reg(RegSet::of(t0, xmethod), sp);   // push << t0 & xmethod >> to sp
 655   call(entry_point);
 656   if (retaddr != nullptr) {
 657     bind(*retaddr);
 658   }
 659   pop_reg(RegSet::of(t0, xmethod), sp);   // pop << t0 & xmethod >> from sp
 660 }
 661 
 662 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
 663   call_VM_leaf_base(entry_point, number_of_arguments);
 664 }
 665 
 666 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
 667   pass_arg0(this, arg_0);
 668   call_VM_leaf_base(entry_point, 1);
 669 }
 670 
 671 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 672   pass_arg0(this, arg_0);
 673   pass_arg1(this, arg_1);
 674   call_VM_leaf_base(entry_point, 2);
 675 }
 676 
 677 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
 678                                   Register arg_1, Register arg_2) {
 679   pass_arg0(this, arg_0);
 680   pass_arg1(this, arg_1);
 681   pass_arg2(this, arg_2);
 682   call_VM_leaf_base(entry_point, 3);
 683 }
 684 
 685 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
 686   pass_arg0(this, arg_0);
 687   MacroAssembler::call_VM_leaf_base(entry_point, 1);
 688 }
 689 
 690 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 691 
 692   assert(arg_0 != c_rarg1, "smashed arg");
 693   pass_arg1(this, arg_1);
 694   pass_arg0(this, arg_0);
 695   MacroAssembler::call_VM_leaf_base(entry_point, 2);
 696 }
 697 
 698 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 699   assert(arg_0 != c_rarg2, "smashed arg");
 700   assert(arg_1 != c_rarg2, "smashed arg");
 701   pass_arg2(this, arg_2);
 702   assert(arg_0 != c_rarg1, "smashed arg");
 703   pass_arg1(this, arg_1);
 704   pass_arg0(this, arg_0);
 705   MacroAssembler::call_VM_leaf_base(entry_point, 3);
 706 }
 707 
 708 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
 709   assert(arg_0 != c_rarg3, "smashed arg");
 710   assert(arg_1 != c_rarg3, "smashed arg");
 711   assert(arg_2 != c_rarg3, "smashed arg");
 712   pass_arg3(this, arg_3);
 713   assert(arg_0 != c_rarg2, "smashed arg");
 714   assert(arg_1 != c_rarg2, "smashed arg");
 715   pass_arg2(this, arg_2);
 716   assert(arg_0 != c_rarg1, "smashed arg");
 717   pass_arg1(this, arg_1);
 718   pass_arg0(this, arg_0);
 719   MacroAssembler::call_VM_leaf_base(entry_point, 4);
 720 }
 721 
 722 void MacroAssembler::la(Register Rd, const address dest) {
 723   int64_t offset = dest - pc();
 724   if (is_valid_32bit_offset(offset)) {
 725     auipc(Rd, (int32_t)offset + 0x800);  //0x800, Note:the 11th sign bit
 726     addi(Rd, Rd, ((int64_t)offset << 52) >> 52);
 727   } else {
 728     movptr(Rd, dest);
 729   }
 730 }
 731 
 732 void MacroAssembler::la(Register Rd, const Address &adr) {
 733   switch (adr.getMode()) {
 734     case Address::literal: {
 735       relocInfo::relocType rtype = adr.rspec().reloc()->type();
 736       if (rtype == relocInfo::none) {
 737         mv(Rd, (intptr_t)(adr.target()));
 738       } else {
 739         relocate(adr.rspec(), [&] {
 740           movptr(Rd, adr.target());
 741         });
 742       }
 743       break;
 744     }
 745     case Address::base_plus_offset: {
 746       Address new_adr = legitimize_address(Rd, adr);
 747       if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
 748         addi(Rd, new_adr.base(), new_adr.offset());
 749       }
 750       break;
 751     }
 752     default:
 753       ShouldNotReachHere();
 754   }
 755 }
 756 
 757 void MacroAssembler::la(Register Rd, Label &label) {
 758   IncompressibleRegion ir(this);   // the label address may be patched back.
 759   wrap_label(Rd, label, &MacroAssembler::la);
 760 }
 761 
 762 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
 763   lui(Rd, (uint32_t)imm << 12);
 764   srli(Rd, Rd, 12);
 765 }
 766 
 767 void MacroAssembler::li32(Register Rd, int32_t imm) {
 768   // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
 769   int64_t upper = imm, lower = imm;
 770   lower = (imm << 20) >> 20;
 771   upper -= lower;
 772   upper = (int32_t)upper;
 773   // lui Rd, imm[31:12] + imm[11]
 774   lui(Rd, upper);
 775   // use addiw to distinguish li32 to li64
 776   addiw(Rd, Rd, lower);
 777 }
 778 
 779 void MacroAssembler::li64(Register Rd, int64_t imm) {
 780   // Load upper 32 bits. upper = imm[63:32], but if imm[31] == 1 or
 781   // (imm[31:20] == 0x7ff && imm[19] == 1), upper = imm[63:32] + 1.
 782   int64_t lower = imm & 0xffffffff;
 783   lower -= ((lower << 44) >> 44);
 784   int64_t tmp_imm = ((uint64_t)(imm & 0xffffffff00000000)) + (uint64_t)lower;
 785   int32_t upper = (tmp_imm - (int32_t)lower) >> 32;
 786 
 787   // Load upper 32 bits
 788   int64_t up = upper, lo = upper;
 789   lo = (lo << 52) >> 52;
 790   up -= lo;
 791   up = (int32_t)up;
 792   lui(Rd, up);
 793   addi(Rd, Rd, lo);
 794 
 795   // Load the rest 32 bits.
 796   slli(Rd, Rd, 12);
 797   addi(Rd, Rd, (int32_t)lower >> 20);
 798   slli(Rd, Rd, 12);
 799   lower = ((int32_t)imm << 12) >> 20;
 800   addi(Rd, Rd, lower);
 801   slli(Rd, Rd, 8);
 802   lower = imm & 0xff;
 803   addi(Rd, Rd, lower);
 804 }
 805 
 806 void MacroAssembler::li(Register Rd, int64_t imm) {
 807   // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
 808   // li -> c.li
 809   if (do_compress() && (is_simm6(imm) && Rd != x0)) {
 810     c_li(Rd, imm);
 811     return;
 812   }
 813 
 814   int shift = 12;
 815   int64_t upper = imm, lower = imm;
 816   // Split imm to a lower 12-bit sign-extended part and the remainder,
 817   // because addi will sign-extend the lower imm.
 818   lower = ((int32_t)imm << 20) >> 20;
 819   upper -= lower;
 820 
 821   // Test whether imm is a 32-bit integer.
 822   if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
 823         (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
 824     while (((upper >> shift) & 1) == 0) { shift++; }
 825     upper >>= shift;
 826     li(Rd, upper);
 827     slli(Rd, Rd, shift);
 828     if (lower != 0) {
 829       addi(Rd, Rd, lower);
 830     }
 831   } else {
 832     // 32-bit integer
 833     Register hi_Rd = zr;
 834     if (upper != 0) {
 835       lui(Rd, (int32_t)upper);
 836       hi_Rd = Rd;
 837     }
 838     if (lower != 0 || hi_Rd == zr) {
 839       addiw(Rd, hi_Rd, lower);
 840     }
 841   }
 842 }
 843 
 844 #define INSN(NAME, REGISTER)                                       \
 845   void MacroAssembler::NAME(const address dest, Register temp) {   \
 846     assert_cond(dest != nullptr);                                  \
 847     int64_t distance = dest - pc();                                \
 848     if (is_simm21(distance) && ((distance % 2) == 0)) {            \
 849       Assembler::jal(REGISTER, distance);                          \
 850     } else {                                                       \
 851       assert(temp != noreg, "expecting a register");               \
 852       int32_t offset = 0;                                          \
 853       movptr(temp, dest, offset);                                  \
 854       Assembler::jalr(REGISTER, temp, offset);                     \
 855     }                                                              \
 856   }                                                                \
 857 
 858   INSN(j,   x0);
 859   INSN(jal, x1);
 860 
 861 #undef INSN
 862 
 863 #define INSN(NAME, REGISTER)                                       \
 864   void MacroAssembler::NAME(const Address &adr, Register temp) {   \
 865     switch (adr.getMode()) {                                       \
 866       case Address::literal: {                                     \
 867         relocate(adr.rspec(), [&] {                                \
 868           NAME(adr.target(), temp);                                \
 869         });                                                        \
 870         break;                                                     \
 871       }                                                            \
 872       case Address::base_plus_offset: {                            \
 873         int32_t offset = ((int32_t)adr.offset() << 20) >> 20;      \
 874         la(temp, Address(adr.base(), adr.offset() - offset));      \
 875         Assembler::jalr(REGISTER, temp, offset);                   \
 876         break;                                                     \
 877       }                                                            \
 878       default:                                                     \
 879         ShouldNotReachHere();                                      \
 880     }                                                              \
 881   }
 882 
 883   INSN(j,   x0);
 884   INSN(jal, x1);
 885 
 886 #undef INSN
 887 
 888 #define INSN(NAME)                                                                    \
 889   void MacroAssembler::NAME(Register Rd, const address dest, Register temp) {         \
 890     assert_cond(dest != nullptr);                                                     \
 891     int64_t distance = dest - pc();                                                   \
 892     if (is_simm21(distance) && ((distance % 2) == 0)) {                               \
 893       Assembler::NAME(Rd, distance);                                                  \
 894     } else {                                                                          \
 895       assert_different_registers(Rd, temp);                                           \
 896       int32_t offset = 0;                                                             \
 897       movptr(temp, dest, offset);                                                     \
 898       jalr(Rd, temp, offset);                                                         \
 899     }                                                                                 \
 900   }                                                                                   \
 901   void MacroAssembler::NAME(Register Rd, Label &L, Register temp) {                   \
 902     assert_different_registers(Rd, temp);                                             \
 903     wrap_label(Rd, L, temp, &MacroAssembler::NAME);                                   \
 904   }
 905 
 906   INSN(jal);
 907 
 908 #undef INSN
 909 
 910 #define INSN(NAME, REGISTER)                                       \
 911   void MacroAssembler::NAME(Label &l, Register temp) {             \
 912     jal(REGISTER, l, temp);                                        \
 913   }                                                                \
 914 
 915   INSN(j,   x0);
 916   INSN(jal, x1);
 917 
 918 #undef INSN
 919 
 920 void MacroAssembler::wrap_label(Register Rt, Label &L, Register tmp, load_insn_by_temp insn) {
 921   if (L.is_bound()) {
 922     (this->*insn)(Rt, target(L), tmp);
 923   } else {
 924     L.add_patch_at(code(), locator());
 925     (this->*insn)(Rt, pc(), tmp);
 926   }
 927 }
 928 
 929 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
 930   if (L.is_bound()) {
 931     (this->*insn)(Rt, target(L));
 932   } else {
 933     L.add_patch_at(code(), locator());
 934     (this->*insn)(Rt, pc());
 935   }
 936 }
 937 
 938 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
 939                                 compare_and_branch_insn insn,
 940                                 compare_and_branch_label_insn neg_insn, bool is_far) {
 941   if (is_far) {
 942     Label done;
 943     (this->*neg_insn)(r1, r2, done, /* is_far */ false);
 944     j(L);
 945     bind(done);
 946   } else {
 947     if (L.is_bound()) {
 948       (this->*insn)(r1, r2, target(L));
 949     } else {
 950       L.add_patch_at(code(), locator());
 951       (this->*insn)(r1, r2, pc());
 952     }
 953   }
 954 }
 955 
 956 #define INSN(NAME, NEG_INSN)                                                              \
 957   void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
 958     wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
 959   }
 960 
 961   INSN(beq,  bne);
 962   INSN(bne,  beq);
 963   INSN(blt,  bge);
 964   INSN(bge,  blt);
 965   INSN(bltu, bgeu);
 966   INSN(bgeu, bltu);
 967 
 968 #undef INSN
 969 
 970 #define INSN(NAME)                                                                \
 971   void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
 972     NAME(Rs, zr, dest);                                                           \
 973   }                                                                               \
 974   void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
 975     NAME(Rs, zr, l, is_far);                                                      \
 976   }                                                                               \
 977 
 978   INSN(beq);
 979   INSN(bne);
 980   INSN(blt);
 981   INSN(ble);
 982   INSN(bge);
 983   INSN(bgt);
 984 
 985 #undef INSN
 986 
 987 #define INSN(NAME, NEG_INSN)                                                      \
 988   void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
 989     NEG_INSN(Rt, Rs, dest);                                                       \
 990   }                                                                               \
 991   void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
 992     NEG_INSN(Rt, Rs, l, is_far);                                                  \
 993   }
 994 
 995   INSN(bgt,  blt);
 996   INSN(ble,  bge);
 997   INSN(bgtu, bltu);
 998   INSN(bleu, bgeu);
 999 
1000 #undef INSN
1001 
1002 // Float compare branch instructions
1003 
1004 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1005   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1006     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1007     BRANCH(t0, l, is_far);                                                                                              \
1008   }                                                                                                                     \
1009   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1010     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1011     BRANCH(t0, l, is_far);                                                                                              \
1012   }
1013 
1014   INSN(beq, feq, bnez);
1015   INSN(bne, feq, beqz);
1016 
1017 #undef INSN
1018 
1019 
1020 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1021   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1022                                     bool is_far, bool is_unordered) {                 \
1023     if (is_unordered) {                                                               \
1024       /* jump if either source is NaN or condition is expected */                     \
1025       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1026       beqz(t0, l, is_far);                                                            \
1027     } else {                                                                          \
1028       /* jump if no NaN in source and condition is expected */                        \
1029       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1030       bnez(t0, l, is_far);                                                            \
1031     }                                                                                 \
1032   }                                                                                   \
1033   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1034                                      bool is_far, bool is_unordered) {                \
1035     if (is_unordered) {                                                               \
1036       /* jump if either source is NaN or condition is expected */                     \
1037       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1038       beqz(t0, l, is_far);                                                            \
1039     } else {                                                                          \
1040       /* jump if no NaN in source and condition is expected */                        \
1041       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1042       bnez(t0, l, is_far);                                                            \
1043     }                                                                                 \
1044   }
1045 
1046   INSN(ble, fle, flt);
1047   INSN(blt, flt, fle);
1048 
1049 #undef INSN
1050 
1051 #define INSN(NAME, CMP)                                                              \
1052   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1053                                     bool is_far, bool is_unordered) {                \
1054     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1055   }                                                                                  \
1056   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1057                                      bool is_far, bool is_unordered) {               \
1058     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1059   }
1060 
1061   INSN(bgt, blt);
1062   INSN(bge, ble);
1063 
1064 #undef INSN
1065 
1066 
1067 #define INSN(NAME, CSR)                       \
1068   void MacroAssembler::NAME(Register Rd) {    \
1069     csrr(Rd, CSR);                            \
1070   }
1071 
1072   INSN(rdinstret,  CSR_INSTRET);
1073   INSN(rdcycle,    CSR_CYCLE);
1074   INSN(rdtime,     CSR_TIME);
1075   INSN(frcsr,      CSR_FCSR);
1076   INSN(frrm,       CSR_FRM);
1077   INSN(frflags,    CSR_FFLAGS);
1078 
1079 #undef INSN
1080 
1081 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1082   csrrs(Rd, csr, x0);
1083 }
1084 
1085 #define INSN(NAME, OPFUN)                                      \
1086   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
1087     OPFUN(x0, csr, Rs);                                        \
1088   }
1089 
1090   INSN(csrw, csrrw);
1091   INSN(csrs, csrrs);
1092   INSN(csrc, csrrc);
1093 
1094 #undef INSN
1095 
1096 #define INSN(NAME, OPFUN)                                      \
1097   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
1098     OPFUN(x0, csr, imm);                                       \
1099   }
1100 
1101   INSN(csrwi, csrrwi);
1102   INSN(csrsi, csrrsi);
1103   INSN(csrci, csrrci);
1104 
1105 #undef INSN
1106 
1107 #define INSN(NAME, CSR)                                      \
1108   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
1109     csrrw(Rd, CSR, Rs);                                      \
1110   }
1111 
1112   INSN(fscsr,   CSR_FCSR);
1113   INSN(fsrm,    CSR_FRM);
1114   INSN(fsflags, CSR_FFLAGS);
1115 
1116 #undef INSN
1117 
1118 #define INSN(NAME)                              \
1119   void MacroAssembler::NAME(Register Rs) {      \
1120     NAME(x0, Rs);                               \
1121   }
1122 
1123   INSN(fscsr);
1124   INSN(fsrm);
1125   INSN(fsflags);
1126 
1127 #undef INSN
1128 
1129 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1130   guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1131   csrrwi(Rd, CSR_FRM, imm);
1132 }
1133 
1134 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1135    csrrwi(Rd, CSR_FFLAGS, imm);
1136 }
1137 
1138 #define INSN(NAME)                             \
1139   void MacroAssembler::NAME(unsigned imm) {    \
1140     NAME(x0, imm);                             \
1141   }
1142 
1143   INSN(fsrmi);
1144   INSN(fsflagsi);
1145 
1146 #undef INSN
1147 
1148 void MacroAssembler::push_reg(Register Rs)
1149 {
1150   addi(esp, esp, 0 - wordSize);
1151   sd(Rs, Address(esp, 0));
1152 }
1153 
1154 void MacroAssembler::pop_reg(Register Rd)
1155 {
1156   ld(Rd, Address(esp, 0));
1157   addi(esp, esp, wordSize);
1158 }
1159 
1160 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1161   int count = 0;
1162   // Scan bitset to accumulate register pairs
1163   for (int reg = 31; reg >= 0; reg--) {
1164     if ((1U << 31) & bitset) {
1165       regs[count++] = reg;
1166     }
1167     bitset <<= 1;
1168   }
1169   return count;
1170 }
1171 
1172 // Push integer registers in the bitset supplied. Don't push sp.
1173 // Return the number of words pushed
1174 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
1175   DEBUG_ONLY(int words_pushed = 0;)
1176   unsigned char regs[32];
1177   int count = bitset_to_regs(bitset, regs);
1178   // reserve one slot to align for odd count
1179   int offset = is_even(count) ? 0 : wordSize;
1180 
1181   if (count) {
1182     addi(stack, stack, -count * wordSize - offset);
1183   }
1184   for (int i = count - 1; i >= 0; i--) {
1185     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1186     DEBUG_ONLY(words_pushed++;)
1187   }
1188 
1189   assert(words_pushed == count, "oops, pushed != count");
1190 
1191   return count;
1192 }
1193 
1194 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1195   DEBUG_ONLY(int words_popped = 0;)
1196   unsigned char regs[32];
1197   int count = bitset_to_regs(bitset, regs);
1198   // reserve one slot to align for odd count
1199   int offset = is_even(count) ? 0 : wordSize;
1200 
1201   for (int i = count - 1; i >= 0; i--) {
1202     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1203     DEBUG_ONLY(words_popped++;)
1204   }
1205 
1206   if (count) {
1207     addi(stack, stack, count * wordSize + offset);
1208   }
1209   assert(words_popped == count, "oops, popped != count");
1210 
1211   return count;
1212 }
1213 
1214 // Push floating-point registers in the bitset supplied.
1215 // Return the number of words pushed
1216 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1217   DEBUG_ONLY(int words_pushed = 0;)
1218   unsigned char regs[32];
1219   int count = bitset_to_regs(bitset, regs);
1220   int push_slots = count + (count & 1);
1221 
1222   if (count) {
1223     addi(stack, stack, -push_slots * wordSize);
1224   }
1225 
1226   for (int i = count - 1; i >= 0; i--) {
1227     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1228     DEBUG_ONLY(words_pushed++;)
1229   }
1230 
1231   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1232 
1233   return count;
1234 }
1235 
1236 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1237   DEBUG_ONLY(int words_popped = 0;)
1238   unsigned char regs[32];
1239   int count = bitset_to_regs(bitset, regs);
1240   int pop_slots = count + (count & 1);
1241 
1242   for (int i = count - 1; i >= 0; i--) {
1243     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1244     DEBUG_ONLY(words_popped++;)
1245   }
1246 
1247   if (count) {
1248     addi(stack, stack, pop_slots * wordSize);
1249   }
1250 
1251   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1252 
1253   return count;
1254 }
1255 
1256 #ifdef COMPILER2
1257 // Push vector registers in the bitset supplied.
1258 // Return the number of words pushed
1259 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
1260   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1261 
1262   // Scan bitset to accumulate register pairs
1263   unsigned char regs[32];
1264   int count = bitset_to_regs(bitset, regs);
1265 
1266   for (int i = 0; i < count; i++) {
1267     sub(stack, stack, vector_size_in_bytes);
1268     vs1r_v(as_VectorRegister(regs[i]), stack);
1269   }
1270 
1271   return count * vector_size_in_bytes / wordSize;
1272 }
1273 
1274 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
1275   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1276 
1277   // Scan bitset to accumulate register pairs
1278   unsigned char regs[32];
1279   int count = bitset_to_regs(bitset, regs);
1280 
1281   for (int i = count - 1; i >= 0; i--) {
1282     vl1r_v(as_VectorRegister(regs[i]), stack);
1283     add(stack, stack, vector_size_in_bytes);
1284   }
1285 
1286   return count * vector_size_in_bytes / wordSize;
1287 }
1288 #endif // COMPILER2
1289 
1290 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
1291   // Push integer registers x7, x10-x17, x28-x31.
1292   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1293 
1294   // Push float registers f0-f7, f10-f17, f28-f31.
1295   addi(sp, sp, - wordSize * 20);
1296   int offset = 0;
1297   for (int i = 0; i < 32; i++) {
1298     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1299       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1300     }
1301   }
1302 }
1303 
1304 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
1305   int offset = 0;
1306   for (int i = 0; i < 32; i++) {
1307     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1308       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1309     }
1310   }
1311   addi(sp, sp, wordSize * 20);
1312 
1313   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1314 }
1315 
1316 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
1317   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1318   push_reg(RegSet::range(x5, x31), sp);
1319 
1320   // float registers
1321   addi(sp, sp, - 32 * wordSize);
1322   for (int i = 0; i < 32; i++) {
1323     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
1324   }
1325 
1326   // vector registers
1327   if (save_vectors) {
1328     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
1329     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1330     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1331       add(t0, sp, vector_size_in_bytes * i);
1332       vse64_v(as_VectorRegister(i), t0);
1333     }
1334   }
1335 }
1336 
1337 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
1338   // vector registers
1339   if (restore_vectors) {
1340     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1341     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1342       vle64_v(as_VectorRegister(i), sp);
1343       add(sp, sp, vector_size_in_bytes * 8);
1344     }
1345   }
1346 
1347   // float registers
1348   for (int i = 0; i < 32; i++) {
1349     fld(as_FloatRegister(i), Address(sp, i * wordSize));
1350   }
1351   addi(sp, sp, 32 * wordSize);
1352 
1353   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1354   pop_reg(RegSet::range(x5, x31), sp);
1355 }
1356 
1357 static int patch_offset_in_jal(address branch, int64_t offset) {
1358   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
1359          "offset is too large to be patched in one jal instruction!\n");
1360   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
1361   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
1362   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
1363   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
1364   return NativeInstruction::instruction_size;                                   // only one instruction
1365 }
1366 
1367 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
1368   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
1369          "offset is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n");
1370   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
1371   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
1372   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
1373   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
1374   return NativeInstruction::instruction_size;                                   // only one instruction
1375 }
1376 
1377 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
1378   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
1379   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
1380   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
1381   return PC_RELATIVE_INSTRUCTION_NUM * NativeInstruction::instruction_size;
1382 }
1383 
1384 static int patch_addr_in_movptr(address branch, address target) {
1385   const int MOVPTR_INSTRUCTIONS_NUM = 6;                                        // lui + addi + slli + addi + slli + addi/jalr/load
1386   int32_t lower = ((intptr_t)target << 35) >> 35;
1387   int64_t upper = ((intptr_t)target - lower) >> 29;
1388   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
1389   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
1390   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
1391   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
1392   return MOVPTR_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1393 }
1394 
1395 static int patch_imm_in_li64(address branch, address target) {
1396   const int LI64_INSTRUCTIONS_NUM = 8;                                          // lui + addi + slli + addi + slli + addi + slli + addi
1397   int64_t lower = (intptr_t)target & 0xffffffff;
1398   lower = lower - ((lower << 44) >> 44);
1399   int64_t tmp_imm = ((uint64_t)((intptr_t)target & 0xffffffff00000000)) + (uint64_t)lower;
1400   int32_t upper =  (tmp_imm - (int32_t)lower) >> 32;
1401   int64_t tmp_upper = upper, tmp_lower = upper;
1402   tmp_lower = (tmp_lower << 52) >> 52;
1403   tmp_upper -= tmp_lower;
1404   tmp_upper >>= 12;
1405   // Load upper 32 bits. Upper = target[63:32], but if target[31] = 1 or (target[31:20] == 0x7ff && target[19] == 1),
1406   // upper = target[63:32] + 1.
1407   Assembler::patch(branch + 0,  31, 12, tmp_upper & 0xfffff);                       // Lui.
1408   Assembler::patch(branch + 4,  31, 20, tmp_lower & 0xfff);                         // Addi.
1409   // Load the rest 32 bits.
1410   Assembler::patch(branch + 12, 31, 20, ((int32_t)lower >> 20) & 0xfff);            // Addi.
1411   Assembler::patch(branch + 20, 31, 20, (((intptr_t)target << 44) >> 52) & 0xfff);  // Addi.
1412   Assembler::patch(branch + 28, 31, 20, (intptr_t)target & 0xff);                   // Addi.
1413   return LI64_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1414 }
1415 
1416 static int patch_imm_in_li16u(address branch, uint16_t target) {
1417   Assembler::patch(branch, 31, 12, target); // patch lui only
1418   return NativeInstruction::instruction_size;
1419 }
1420 
1421 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
1422   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
1423   int64_t upper = (intptr_t)target;
1424   int32_t lower = (((int32_t)target) << 20) >> 20;
1425   upper -= lower;
1426   upper = (int32_t)upper;
1427   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
1428   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
1429   return LI32_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1430 }
1431 
1432 static long get_offset_of_jal(address insn_addr) {
1433   assert_cond(insn_addr != nullptr);
1434   long offset = 0;
1435   unsigned insn = Assembler::ld_instr(insn_addr);
1436   long val = (long)Assembler::sextract(insn, 31, 12);
1437   offset |= ((val >> 19) & 0x1) << 20;
1438   offset |= (val & 0xff) << 12;
1439   offset |= ((val >> 8) & 0x1) << 11;
1440   offset |= ((val >> 9) & 0x3ff) << 1;
1441   offset = (offset << 43) >> 43;
1442   return offset;
1443 }
1444 
1445 static long get_offset_of_conditional_branch(address insn_addr) {
1446   long offset = 0;
1447   assert_cond(insn_addr != nullptr);
1448   unsigned insn = Assembler::ld_instr(insn_addr);
1449   offset = (long)Assembler::sextract(insn, 31, 31);
1450   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
1451   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
1452   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
1453   offset = (offset << 41) >> 41;
1454   return offset;
1455 }
1456 
1457 static long get_offset_of_pc_relative(address insn_addr) {
1458   long offset = 0;
1459   assert_cond(insn_addr != nullptr);
1460   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
1461   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
1462   offset = (offset << 32) >> 32;
1463   return offset;
1464 }
1465 
1466 static address get_target_of_movptr(address insn_addr) {
1467   assert_cond(insn_addr != nullptr);
1468   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
1469   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
1470   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
1471   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
1472   return (address) target_address;
1473 }
1474 
1475 static address get_target_of_li64(address insn_addr) {
1476   assert_cond(insn_addr != nullptr);
1477   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 44; // Lui.
1478   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 32;                 // Addi.
1479   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 20;                // Addi.
1480   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20)) << 8;                 // Addi.
1481   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 28), 31, 20));                      // Addi.
1482   return (address)target_address;
1483 }
1484 
1485 address MacroAssembler::get_target_of_li32(address insn_addr) {
1486   assert_cond(insn_addr != nullptr);
1487   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
1488   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
1489   return (address)target_address;
1490 }
1491 
1492 // Patch any kind of instruction; there may be several instructions.
1493 // Return the total length (in bytes) of the instructions.
1494 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
1495   assert_cond(branch != nullptr);
1496   int64_t offset = target - branch;
1497   if (NativeInstruction::is_jal_at(branch)) {                         // jal
1498     return patch_offset_in_jal(branch, offset);
1499   } else if (NativeInstruction::is_branch_at(branch)) {               // beq/bge/bgeu/blt/bltu/bne
1500     return patch_offset_in_conditional_branch(branch, offset);
1501   } else if (NativeInstruction::is_pc_relative_at(branch)) {          // auipc, addi/jalr/load
1502     return patch_offset_in_pc_relative(branch, offset);
1503   } else if (NativeInstruction::is_movptr_at(branch)) {               // movptr
1504     return patch_addr_in_movptr(branch, target);
1505   } else if (NativeInstruction::is_li64_at(branch)) {                 // li64
1506     return patch_imm_in_li64(branch, target);
1507   } else if (NativeInstruction::is_li32_at(branch)) {                 // li32
1508     int64_t imm = (intptr_t)target;
1509     return patch_imm_in_li32(branch, (int32_t)imm);
1510   } else if (NativeInstruction::is_li16u_at(branch)) {
1511     int64_t imm = (intptr_t)target;
1512     return patch_imm_in_li16u(branch, (uint16_t)imm);
1513   } else {
1514 #ifdef ASSERT
1515     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
1516                   Assembler::ld_instr(branch), p2i(branch));
1517     Disassembler::decode(branch - 16, branch + 16);
1518 #endif
1519     ShouldNotReachHere();
1520     return -1;
1521   }
1522 }
1523 
1524 address MacroAssembler::target_addr_for_insn(address insn_addr) {
1525   long offset = 0;
1526   assert_cond(insn_addr != nullptr);
1527   if (NativeInstruction::is_jal_at(insn_addr)) {                     // jal
1528     offset = get_offset_of_jal(insn_addr);
1529   } else if (NativeInstruction::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
1530     offset = get_offset_of_conditional_branch(insn_addr);
1531   } else if (NativeInstruction::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
1532     offset = get_offset_of_pc_relative(insn_addr);
1533   } else if (NativeInstruction::is_movptr_at(insn_addr)) {           // movptr
1534     return get_target_of_movptr(insn_addr);
1535   } else if (NativeInstruction::is_li64_at(insn_addr)) {             // li64
1536     return get_target_of_li64(insn_addr);
1537   } else if (NativeInstruction::is_li32_at(insn_addr)) {             // li32
1538     return get_target_of_li32(insn_addr);
1539   } else {
1540     ShouldNotReachHere();
1541   }
1542   return address(((uintptr_t)insn_addr + offset));
1543 }
1544 
1545 int MacroAssembler::patch_oop(address insn_addr, address o) {
1546   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
1547   // narrow OOPs by setting the upper 16 bits in the first
1548   // instruction.
1549   if (NativeInstruction::is_li32_at(insn_addr)) {
1550     // Move narrow OOP
1551     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
1552     return patch_imm_in_li32(insn_addr, (int32_t)n);
1553   } else if (NativeInstruction::is_movptr_at(insn_addr)) {
1554     // Move wide OOP
1555     return patch_addr_in_movptr(insn_addr, o);
1556   }
1557   ShouldNotReachHere();
1558   return -1;
1559 }
1560 
1561 void MacroAssembler::reinit_heapbase() {
1562   if (UseCompressedOops) {
1563     if (Universe::is_fully_initialized()) {
1564       mv(xheapbase, CompressedOops::ptrs_base());
1565     } else {
1566       ExternalAddress target(CompressedOops::ptrs_base_addr());
1567       relocate(target.rspec(), [&] {
1568         int32_t offset;
1569         la_patchable(xheapbase, target, offset);
1570         ld(xheapbase, Address(xheapbase, offset));
1571       });
1572     }
1573   }
1574 }
1575 
1576 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset) {
1577   int64_t imm64 = (int64_t)addr;
1578 #ifndef PRODUCT
1579   {
1580     char buffer[64];
1581     snprintf(buffer, sizeof(buffer), "0x%" PRIx64, imm64);
1582     block_comment(buffer);
1583   }
1584 #endif
1585   assert((uintptr_t)imm64 < (1ull << 48), "48-bit overflow in address constant");
1586   // Load upper 31 bits
1587   int64_t imm = imm64 >> 17;
1588   int64_t upper = imm, lower = imm;
1589   lower = (lower << 52) >> 52;
1590   upper -= lower;
1591   upper = (int32_t)upper;
1592   lui(Rd, upper);
1593   addi(Rd, Rd, lower);
1594 
1595   // Load the rest 17 bits.
1596   slli(Rd, Rd, 11);
1597   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
1598   slli(Rd, Rd, 6);
1599 
1600   // This offset will be used by following jalr/ld.
1601   offset = imm64 & 0x3f;
1602 }
1603 
1604 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register temp) {
1605   if (is_simm12(increment)) {
1606     addi(Rd, Rn, increment);
1607   } else {
1608     assert_different_registers(Rn, temp);
1609     li(temp, increment);
1610     add(Rd, Rn, temp);
1611   }
1612 }
1613 
1614 void MacroAssembler::addw(Register Rd, Register Rn, int32_t increment, Register temp) {
1615   if (is_simm12(increment)) {
1616     addiw(Rd, Rn, increment);
1617   } else {
1618     assert_different_registers(Rn, temp);
1619     li(temp, increment);
1620     addw(Rd, Rn, temp);
1621   }
1622 }
1623 
1624 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register temp) {
1625   if (is_simm12(-decrement)) {
1626     addi(Rd, Rn, -decrement);
1627   } else {
1628     assert_different_registers(Rn, temp);
1629     li(temp, decrement);
1630     sub(Rd, Rn, temp);
1631   }
1632 }
1633 
1634 void MacroAssembler::subw(Register Rd, Register Rn, int32_t decrement, Register temp) {
1635   if (is_simm12(-decrement)) {
1636     addiw(Rd, Rn, -decrement);
1637   } else {
1638     assert_different_registers(Rn, temp);
1639     li(temp, decrement);
1640     subw(Rd, Rn, temp);
1641   }
1642 }
1643 
1644 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
1645   andr(Rd, Rs1, Rs2);
1646   sign_extend(Rd, Rd, 32);
1647 }
1648 
1649 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
1650   orr(Rd, Rs1, Rs2);
1651   sign_extend(Rd, Rd, 32);
1652 }
1653 
1654 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
1655   xorr(Rd, Rs1, Rs2);
1656   sign_extend(Rd, Rd, 32);
1657 }
1658 
1659 // Rd = Rs1 & (~Rd2)
1660 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
1661   if (UseZbb) {
1662     Assembler::andn(Rd, Rs1, Rs2);
1663     return;
1664   }
1665 
1666   notr(Rd, Rs2);
1667   andr(Rd, Rs1, Rd);
1668 }
1669 
1670 // Rd = Rs1 | (~Rd2)
1671 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
1672   if (UseZbb) {
1673     Assembler::orn(Rd, Rs1, Rs2);
1674     return;
1675   }
1676 
1677   notr(Rd, Rs2);
1678   orr(Rd, Rs1, Rd);
1679 }
1680 
1681 // Note: load_unsigned_short used to be called load_unsigned_word.
1682 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1683   int off = offset();
1684   lhu(dst, src);
1685   return off;
1686 }
1687 
1688 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1689   int off = offset();
1690   lbu(dst, src);
1691   return off;
1692 }
1693 
1694 int MacroAssembler::load_signed_short(Register dst, Address src) {
1695   int off = offset();
1696   lh(dst, src);
1697   return off;
1698 }
1699 
1700 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1701   int off = offset();
1702   lb(dst, src);
1703   return off;
1704 }
1705 
1706 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
1707   switch (size_in_bytes) {
1708     case  8:  ld(dst, src); break;
1709     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
1710     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1711     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1712     default:  ShouldNotReachHere();
1713   }
1714 }
1715 
1716 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
1717   switch (size_in_bytes) {
1718     case  8:  sd(src, dst); break;
1719     case  4:  sw(src, dst); break;
1720     case  2:  sh(src, dst); break;
1721     case  1:  sb(src, dst); break;
1722     default:  ShouldNotReachHere();
1723   }
1724 }
1725 
1726 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
1727 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
1728   if (granularity != 1 && granularity != 2) {
1729     ShouldNotReachHere();
1730   }
1731   if (AvoidUnalignedAccesses && (granularity != 2)) {
1732     assert_different_registers(dst, tmp);
1733     assert_different_registers(tmp, src.base());
1734     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
1735     slli(tmp, tmp, 8);
1736     lbu(dst, src);
1737     add(dst, dst, tmp);
1738   } else {
1739     is_signed ? lh(dst, src) : lhu(dst, src);
1740   }
1741 }
1742 
1743 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
1744 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
1745   if (AvoidUnalignedAccesses && (granularity != 4)) {
1746     switch(granularity) {
1747       case 1:
1748         assert_different_registers(dst, tmp, src.base());
1749         lbu(dst, src);
1750         lbu(tmp, Address(src.base(), src.offset() + 1));
1751         slli(tmp, tmp, 8);
1752         add(dst, dst, tmp);
1753         lbu(tmp, Address(src.base(), src.offset() + 2));
1754         slli(tmp, tmp, 16);
1755         add(dst, dst, tmp);
1756         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
1757         slli(tmp, tmp, 24);
1758         add(dst, dst, tmp);
1759         break;
1760       case 2:
1761         assert_different_registers(dst, tmp);
1762         assert_different_registers(tmp, src.base());
1763         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
1764         slli(tmp, tmp, 16);
1765         lhu(dst, src);
1766         add(dst, dst, tmp);
1767         break;
1768       default:
1769         ShouldNotReachHere();
1770     }
1771   } else {
1772     is_signed ? lw(dst, src) : lwu(dst, src);
1773   }
1774 }
1775 
1776 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
1777 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
1778   if (AvoidUnalignedAccesses && (granularity != 8)) {
1779     switch(granularity){
1780       case 1:
1781         assert_different_registers(dst, tmp, src.base());
1782         lbu(dst, src);
1783         lbu(tmp, Address(src.base(), src.offset() + 1));
1784         slli(tmp, tmp, 8);
1785         add(dst, dst, tmp);
1786         lbu(tmp, Address(src.base(), src.offset() + 2));
1787         slli(tmp, tmp, 16);
1788         add(dst, dst, tmp);
1789         lbu(tmp, Address(src.base(), src.offset() + 3));
1790         slli(tmp, tmp, 24);
1791         add(dst, dst, tmp);
1792         lbu(tmp, Address(src.base(), src.offset() + 4));
1793         slli(tmp, tmp, 32);
1794         add(dst, dst, tmp);
1795         lbu(tmp, Address(src.base(), src.offset() + 5));
1796         slli(tmp, tmp, 40);
1797         add(dst, dst, tmp);
1798         lbu(tmp, Address(src.base(), src.offset() + 6));
1799         slli(tmp, tmp, 48);
1800         add(dst, dst, tmp);
1801         lbu(tmp, Address(src.base(), src.offset() + 7));
1802         slli(tmp, tmp, 56);
1803         add(dst, dst, tmp);
1804         break;
1805       case 2:
1806         assert_different_registers(dst, tmp, src.base());
1807         lhu(dst, src);
1808         lhu(tmp, Address(src.base(), src.offset() + 2));
1809         slli(tmp, tmp, 16);
1810         add(dst, dst, tmp);
1811         lhu(tmp, Address(src.base(), src.offset() + 4));
1812         slli(tmp, tmp, 32);
1813         add(dst, dst, tmp);
1814         lhu(tmp, Address(src.base(), src.offset() + 6));
1815         slli(tmp, tmp, 48);
1816         add(dst, dst, tmp);
1817         break;
1818       case 4:
1819         assert_different_registers(dst, tmp);
1820         assert_different_registers(tmp, src.base());
1821         lwu(tmp, Address(src.base(), src.offset() + 4));
1822         slli(tmp, tmp, 32);
1823         lwu(dst, src);
1824         add(dst, dst, tmp);
1825         break;
1826       default:
1827         ShouldNotReachHere();
1828     }
1829   } else {
1830     ld(dst, src);
1831   }
1832 }
1833 
1834 
1835 // reverse bytes in halfword in lower 16 bits and sign-extend
1836 // Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
1837 void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
1838   if (UseZbb) {
1839     rev8(Rd, Rs);
1840     srai(Rd, Rd, 48);
1841     return;
1842   }
1843   assert_different_registers(Rs, tmp);
1844   assert_different_registers(Rd, tmp);
1845   srli(tmp, Rs, 8);
1846   andi(tmp, tmp, 0xFF);
1847   slli(Rd, Rs, 56);
1848   srai(Rd, Rd, 48); // sign-extend
1849   orr(Rd, Rd, tmp);
1850 }
1851 
1852 // reverse bytes in lower word and sign-extend
1853 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
1854 void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1855   if (UseZbb) {
1856     rev8(Rd, Rs);
1857     srai(Rd, Rd, 32);
1858     return;
1859   }
1860   assert_different_registers(Rs, tmp1, tmp2);
1861   assert_different_registers(Rd, tmp1, tmp2);
1862   revb_h_w_u(Rd, Rs, tmp1, tmp2);
1863   slli(tmp2, Rd, 48);
1864   srai(tmp2, tmp2, 32); // sign-extend
1865   srli(Rd, Rd, 16);
1866   orr(Rd, Rd, tmp2);
1867 }
1868 
1869 // reverse bytes in halfword in lower 16 bits and zero-extend
1870 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1871 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
1872   if (UseZbb) {
1873     rev8(Rd, Rs);
1874     srli(Rd, Rd, 48);
1875     return;
1876   }
1877   assert_different_registers(Rs, tmp);
1878   assert_different_registers(Rd, tmp);
1879   srli(tmp, Rs, 8);
1880   andi(tmp, tmp, 0xFF);
1881   andi(Rd, Rs, 0xFF);
1882   slli(Rd, Rd, 8);
1883   orr(Rd, Rd, tmp);
1884 }
1885 
1886 // reverse bytes in halfwords in lower 32 bits and zero-extend
1887 // Rd[31:0] = Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1888 void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1889   if (UseZbb) {
1890     rev8(Rd, Rs);
1891     rori(Rd, Rd, 32);
1892     roriw(Rd, Rd, 16);
1893     zero_extend(Rd, Rd, 32);
1894     return;
1895   }
1896   assert_different_registers(Rs, tmp1, tmp2);
1897   assert_different_registers(Rd, tmp1, tmp2);
1898   srli(tmp2, Rs, 16);
1899   revb_h_h_u(tmp2, tmp2, tmp1);
1900   revb_h_h_u(Rd, Rs, tmp1);
1901   slli(tmp2, tmp2, 16);
1902   orr(Rd, Rd, tmp2);
1903 }
1904 
1905 // This method is only used for revb_h
1906 // Rd = Rs[47:0] Rs[55:48] Rs[63:56]
1907 void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1908   assert_different_registers(Rs, tmp1, tmp2);
1909   assert_different_registers(Rd, tmp1);
1910   srli(tmp1, Rs, 48);
1911   andi(tmp2, tmp1, 0xFF);
1912   slli(tmp2, tmp2, 8);
1913   srli(tmp1, tmp1, 8);
1914   orr(tmp1, tmp1, tmp2);
1915   slli(Rd, Rs, 16);
1916   orr(Rd, Rd, tmp1);
1917 }
1918 
1919 // reverse bytes in each halfword
1920 // Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
1921 void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1922   if (UseZbb) {
1923     assert_different_registers(Rs, tmp1);
1924     assert_different_registers(Rd, tmp1);
1925     rev8(Rd, Rs);
1926     zero_extend(tmp1, Rd, 32);
1927     roriw(tmp1, tmp1, 16);
1928     slli(tmp1, tmp1, 32);
1929     srli(Rd, Rd, 32);
1930     roriw(Rd, Rd, 16);
1931     zero_extend(Rd, Rd, 32);
1932     orr(Rd, Rd, tmp1);
1933     return;
1934   }
1935   assert_different_registers(Rs, tmp1, tmp2);
1936   assert_different_registers(Rd, tmp1, tmp2);
1937   revb_h_helper(Rd, Rs, tmp1, tmp2);
1938   for (int i = 0; i < 3; ++i) {
1939     revb_h_helper(Rd, Rd, tmp1, tmp2);
1940   }
1941 }
1942 
1943 // reverse bytes in each word
1944 // Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
1945 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1946   if (UseZbb) {
1947     rev8(Rd, Rs);
1948     rori(Rd, Rd, 32);
1949     return;
1950   }
1951   assert_different_registers(Rs, tmp1, tmp2);
1952   assert_different_registers(Rd, tmp1, tmp2);
1953   revb(Rd, Rs, tmp1, tmp2);
1954   ror_imm(Rd, Rd, 32);
1955 }
1956 
1957 // reverse bytes in doubleword
1958 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
1959 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1960   if (UseZbb) {
1961     rev8(Rd, Rs);
1962     return;
1963   }
1964   assert_different_registers(Rs, tmp1, tmp2);
1965   assert_different_registers(Rd, tmp1, tmp2);
1966   andi(tmp1, Rs, 0xFF);
1967   slli(tmp1, tmp1, 8);
1968   for (int step = 8; step < 56; step += 8) {
1969     srli(tmp2, Rs, step);
1970     andi(tmp2, tmp2, 0xFF);
1971     orr(tmp1, tmp1, tmp2);
1972     slli(tmp1, tmp1, 8);
1973   }
1974   srli(Rd, Rs, 56);
1975   andi(Rd, Rd, 0xFF);
1976   orr(Rd, tmp1, Rd);
1977 }
1978 
1979 // rotate right with shift bits
1980 void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Register tmp)
1981 {
1982   if (UseZbb) {
1983     rori(dst, src, shift);
1984     return;
1985   }
1986 
1987   assert_different_registers(dst, tmp);
1988   assert_different_registers(src, tmp);
1989   assert(shift < 64, "shift amount must be < 64");
1990   slli(tmp, src, 64 - shift);
1991   srli(dst, src, shift);
1992   orr(dst, dst, tmp);
1993 }
1994 
1995 // rotate left with shift bits, 32-bit version
1996 void MacroAssembler::rolw_imm(Register dst, Register src, uint32_t shift, Register tmp) {
1997   if (UseZbb) {
1998     // no roliw available
1999     roriw(dst, src, 32 - shift);
2000     return;
2001   }
2002 
2003   assert_different_registers(dst, tmp);
2004   assert_different_registers(src, tmp);
2005   assert(shift < 32, "shift amount must be < 32");
2006   srliw(tmp, src, 32 - shift);
2007   slliw(dst, src, shift);
2008   orr(dst, dst, tmp);
2009 }
2010 
2011 void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) {
2012   if (is_simm12(imm)) {
2013     and_imm12(Rd, Rn, imm);
2014   } else {
2015     assert_different_registers(Rn, tmp);
2016     mv(tmp, imm);
2017     andr(Rd, Rn, tmp);
2018   }
2019 }
2020 
2021 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
2022   ld(tmp1, adr);
2023   if (src.is_register()) {
2024     orr(tmp1, tmp1, src.as_register());
2025   } else {
2026     if (is_simm12(src.as_constant())) {
2027       ori(tmp1, tmp1, src.as_constant());
2028     } else {
2029       assert_different_registers(tmp1, tmp2);
2030       mv(tmp2, src.as_constant());
2031       orr(tmp1, tmp1, tmp2);
2032     }
2033   }
2034   sd(tmp1, adr);
2035 }
2036 
2037 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L) {
2038   assert_different_registers(oop, trial_klass, tmp1, tmp2);
2039   if (UseCompressedClassPointers) {
2040     lwu(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2041     if (CompressedKlassPointers::base() == nullptr) {
2042       slli(tmp1, tmp1, CompressedKlassPointers::shift());
2043       beq(trial_klass, tmp1, L);
2044       return;
2045     }
2046     decode_klass_not_null(tmp1, tmp2);
2047   } else {
2048     ld(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2049   }
2050   beq(trial_klass, tmp1, L);
2051 }
2052 
2053 // Move an oop into a register.
2054 void MacroAssembler::movoop(Register dst, jobject obj) {
2055   int oop_index;
2056   if (obj == nullptr) {
2057     oop_index = oop_recorder()->allocate_oop_index(obj);
2058   } else {
2059 #ifdef ASSERT
2060     {
2061       ThreadInVMfromUnknown tiv;
2062       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
2063     }
2064 #endif
2065     oop_index = oop_recorder()->find_index(obj);
2066   }
2067   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2068 
2069   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
2070     mv(dst, Address((address)obj, rspec));
2071   } else {
2072     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
2073     ld_constant(dst, Address(dummy, rspec));
2074   }
2075 }
2076 
2077 // Move a metadata address into a register.
2078 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2079   int oop_index;
2080   if (obj == nullptr) {
2081     oop_index = oop_recorder()->allocate_metadata_index(obj);
2082   } else {
2083     oop_index = oop_recorder()->find_index(obj);
2084   }
2085   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2086   mv(dst, Address((address)obj, rspec));
2087 }
2088 
2089 // Writes to stack successive pages until offset reached to check for
2090 // stack overflow + shadow pages.  This clobbers tmp.
2091 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2092   assert_different_registers(tmp, size, t0);
2093   // Bang stack for total size given plus shadow page size.
2094   // Bang one page at a time because large size can bang beyond yellow and
2095   // red zones.
2096   mv(t0, (int)os::vm_page_size());
2097   Label loop;
2098   bind(loop);
2099   sub(tmp, sp, t0);
2100   subw(size, size, t0);
2101   sd(size, Address(tmp));
2102   bgtz(size, loop);
2103 
2104   // Bang down shadow pages too.
2105   // At this point, (tmp-0) is the last address touched, so don't
2106   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2107   // was post-decremented.)  Skip this address by starting at i=1, and
2108   // touch a few more pages below.  N.B.  It is important to touch all
2109   // the way down to and including i=StackShadowPages.
2110   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
2111     // this could be any sized move but this is can be a debugging crumb
2112     // so the bigger the better.
2113     sub(tmp, tmp, (int)os::vm_page_size());
2114     sd(size, Address(tmp, 0));
2115   }
2116 }
2117 
2118 SkipIfEqual::SkipIfEqual(MacroAssembler* masm, const bool* flag_addr, bool value) {
2119   int32_t offset = 0;
2120   _masm = masm;
2121   ExternalAddress target((address)flag_addr);
2122   _masm->relocate(target.rspec(), [&] {
2123     int32_t offset;
2124     _masm->la_patchable(t0, target, offset);
2125     _masm->lbu(t0, Address(t0, offset));
2126   });
2127   if (value) {
2128     _masm->bnez(t0, _label);
2129   } else {
2130     _masm->beqz(t0, _label);
2131   }
2132 }
2133 
2134 SkipIfEqual::~SkipIfEqual() {
2135   _masm->bind(_label);
2136   _masm = nullptr;
2137 }
2138 
2139 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
2140   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
2141   ld(dst, Address(xmethod, Method::const_offset()));
2142   ld(dst, Address(dst, ConstMethod::constants_offset()));
2143   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
2144   ld(dst, Address(dst, mirror_offset));
2145   resolve_oop_handle(dst, tmp1, tmp2);
2146 }
2147 
2148 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
2149   // OopHandle::resolve is an indirection.
2150   assert_different_registers(result, tmp1, tmp2);
2151   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
2152 }
2153 
2154 // ((WeakHandle)result).resolve()
2155 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
2156   assert_different_registers(result, tmp1, tmp2);
2157   Label resolved;
2158 
2159   // A null weak handle resolves to null.
2160   beqz(result, resolved);
2161 
2162   // Only 64 bit platforms support GCs that require a tmp register
2163   // Only IN_HEAP loads require a thread_tmp register
2164   // WeakHandle::resolve is an indirection like jweak.
2165   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2166                  result, Address(result), tmp1, tmp2);
2167   bind(resolved);
2168 }
2169 
2170 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
2171                                     Register dst, Address src,
2172                                     Register tmp1, Register tmp2) {
2173   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2174   decorators = AccessInternal::decorator_fixup(decorators, type);
2175   bool as_raw = (decorators & AS_RAW) != 0;
2176   if (as_raw) {
2177     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
2178   } else {
2179     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
2180   }
2181 }
2182 
2183 void MacroAssembler::null_check(Register reg, int offset) {
2184   if (needs_explicit_null_check(offset)) {
2185     // provoke OS null exception if reg is null by
2186     // accessing M[reg] w/o changing any registers
2187     // NOTE: this is plenty to provoke a segv
2188     ld(zr, Address(reg, 0));
2189   } else {
2190     // nothing to do, (later) access of M[reg + offset]
2191     // will provoke OS null exception if reg is null
2192   }
2193 }
2194 
2195 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
2196                                      Address dst, Register val,
2197                                      Register tmp1, Register tmp2, Register tmp3) {
2198   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2199   decorators = AccessInternal::decorator_fixup(decorators, type);
2200   bool as_raw = (decorators & AS_RAW) != 0;
2201   if (as_raw) {
2202     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2203   } else {
2204     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2205   }
2206 }
2207 
2208 // Algorithm must match CompressedOops::encode.
2209 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2210   verify_oop_msg(s, "broken oop in encode_heap_oop");
2211   if (CompressedOops::base() == nullptr) {
2212     if (CompressedOops::shift() != 0) {
2213       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2214       srli(d, s, LogMinObjAlignmentInBytes);
2215     } else {
2216       mv(d, s);
2217     }
2218   } else {
2219     Label notNull;
2220     sub(d, s, xheapbase);
2221     bgez(d, notNull);
2222     mv(d, zr);
2223     bind(notNull);
2224     if (CompressedOops::shift() != 0) {
2225       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2226       srli(d, d, CompressedOops::shift());
2227     }
2228   }
2229 }
2230 
2231 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
2232   assert_different_registers(dst, tmp);
2233   assert_different_registers(src, tmp);
2234   if (UseCompressedClassPointers) {
2235     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2236     decode_klass_not_null(dst, tmp);
2237   } else {
2238     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2239   }
2240 }
2241 
2242 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
2243   // FIXME: Should this be a store release? concurrent gcs assumes
2244   // klass length is valid if klass field is not null.
2245   if (UseCompressedClassPointers) {
2246     encode_klass_not_null(src, tmp);
2247     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2248   } else {
2249     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2250   }
2251 }
2252 
2253 void MacroAssembler::store_klass_gap(Register dst, Register src) {
2254   if (UseCompressedClassPointers) {
2255     // Store to klass gap in destination
2256     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2257   }
2258 }
2259 
2260 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
2261   assert_different_registers(r, tmp);
2262   decode_klass_not_null(r, r, tmp);
2263 }
2264 
2265 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
2266   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2267 
2268   if (CompressedKlassPointers::base() == nullptr) {
2269     if (CompressedKlassPointers::shift() != 0) {
2270       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2271       slli(dst, src, LogKlassAlignmentInBytes);
2272     } else {
2273       mv(dst, src);
2274     }
2275     return;
2276   }
2277 
2278   Register xbase = dst;
2279   if (dst == src) {
2280     xbase = tmp;
2281   }
2282 
2283   assert_different_registers(src, xbase);
2284   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2285 
2286   if (CompressedKlassPointers::shift() != 0) {
2287     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2288     assert_different_registers(t0, xbase);
2289     shadd(dst, src, xbase, t0, LogKlassAlignmentInBytes);
2290   } else {
2291     add(dst, xbase, src);
2292   }
2293 }
2294 
2295 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
2296   assert_different_registers(r, tmp);
2297   encode_klass_not_null(r, r, tmp);
2298 }
2299 
2300 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
2301   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2302 
2303   if (CompressedKlassPointers::base() == nullptr) {
2304     if (CompressedKlassPointers::shift() != 0) {
2305       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2306       srli(dst, src, LogKlassAlignmentInBytes);
2307     } else {
2308       mv(dst, src);
2309     }
2310     return;
2311   }
2312 
2313   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
2314       CompressedKlassPointers::shift() == 0) {
2315     zero_extend(dst, src, 32);
2316     return;
2317   }
2318 
2319   Register xbase = dst;
2320   if (dst == src) {
2321     xbase = tmp;
2322   }
2323 
2324   assert_different_registers(src, xbase);
2325   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2326   sub(dst, src, xbase);
2327   if (CompressedKlassPointers::shift() != 0) {
2328     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2329     srli(dst, dst, LogKlassAlignmentInBytes);
2330   }
2331 }
2332 
2333 void MacroAssembler::decode_heap_oop_not_null(Register r) {
2334   decode_heap_oop_not_null(r, r);
2335 }
2336 
2337 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
2338   assert(UseCompressedOops, "should only be used for compressed headers");
2339   assert(Universe::heap() != nullptr, "java heap should be initialized");
2340   // Cannot assert, unverified entry point counts instructions (see .ad file)
2341   // vtableStubs also counts instructions in pd_code_size_limit.
2342   // Also do not verify_oop as this is called by verify_oop.
2343   if (CompressedOops::shift() != 0) {
2344     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2345     slli(dst, src, LogMinObjAlignmentInBytes);
2346     if (CompressedOops::base() != nullptr) {
2347       add(dst, xheapbase, dst);
2348     }
2349   } else {
2350     assert(CompressedOops::base() == nullptr, "sanity");
2351     mv(dst, src);
2352   }
2353 }
2354 
2355 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
2356   if (CompressedOops::base() == nullptr) {
2357     if (CompressedOops::shift() != 0 || d != s) {
2358       slli(d, s, CompressedOops::shift());
2359     }
2360   } else {
2361     Label done;
2362     mv(d, s);
2363     beqz(s, done);
2364     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
2365     bind(done);
2366   }
2367   verify_oop_msg(d, "broken oop in decode_heap_oop");
2368 }
2369 
2370 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
2371                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
2372   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
2373 }
2374 
2375 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
2376                                    Register tmp2, DecoratorSet decorators) {
2377   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
2378 }
2379 
2380 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
2381                                             Register tmp2, DecoratorSet decorators) {
2382   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
2383 }
2384 
2385 // Used for storing nulls.
2386 void MacroAssembler::store_heap_oop_null(Address dst) {
2387   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
2388 }
2389 
2390 int MacroAssembler::corrected_idivl(Register result, Register rs1, Register rs2,
2391                                     bool want_remainder)
2392 {
2393   // Full implementation of Java idiv and irem.  The function
2394   // returns the (pc) offset of the div instruction - may be needed
2395   // for implicit exceptions.
2396   //
2397   // input : rs1: dividend
2398   //         rs2: divisor
2399   //
2400   // result: either
2401   //         quotient  (= rs1 idiv rs2)
2402   //         remainder (= rs1 irem rs2)
2403 
2404 
2405   int idivl_offset = offset();
2406   if (!want_remainder) {
2407     divw(result, rs1, rs2);
2408   } else {
2409     remw(result, rs1, rs2); // result = rs1 % rs2;
2410   }
2411   return idivl_offset;
2412 }
2413 
2414 int MacroAssembler::corrected_idivq(Register result, Register rs1, Register rs2,
2415                                     bool want_remainder)
2416 {
2417   // Full implementation of Java ldiv and lrem.  The function
2418   // returns the (pc) offset of the div instruction - may be needed
2419   // for implicit exceptions.
2420   //
2421   // input : rs1: dividend
2422   //         rs2: divisor
2423   //
2424   // result: either
2425   //         quotient  (= rs1 idiv rs2)
2426   //         remainder (= rs1 irem rs2)
2427 
2428   int idivq_offset = offset();
2429   if (!want_remainder) {
2430     div(result, rs1, rs2);
2431   } else {
2432     rem(result, rs1, rs2); // result = rs1 % rs2;
2433   }
2434   return idivq_offset;
2435 }
2436 
2437 // Look up the method for a megamorpic invkkeinterface call.
2438 // The target method is determined by <intf_klass, itable_index>.
2439 // The receiver klass is in recv_klass.
2440 // On success, the result will be in method_result, and execution falls through.
2441 // On failure, execution transfers to the given label.
2442 void MacroAssembler::lookup_interface_method(Register recv_klass,
2443                                              Register intf_klass,
2444                                              RegisterOrConstant itable_index,
2445                                              Register method_result,
2446                                              Register scan_tmp,
2447                                              Label& L_no_such_interface,
2448                                              bool return_method) {
2449   assert_different_registers(recv_klass, intf_klass, scan_tmp);
2450   assert_different_registers(method_result, intf_klass, scan_tmp);
2451   assert(recv_klass != method_result || !return_method,
2452          "recv_klass can be destroyed when mehtid isn't needed");
2453   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
2454          "caller must be same register for non-constant itable index as for method");
2455 
2456   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
2457   int vtable_base = in_bytes(Klass::vtable_start_offset());
2458   int itentry_off = in_bytes(itableMethodEntry::method_offset());
2459   int scan_step   = itableOffsetEntry::size() * wordSize;
2460   int vte_size    = vtableEntry::size_in_bytes();
2461   assert(vte_size == wordSize, "else adjust times_vte_scale");
2462 
2463   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
2464 
2465   // %%% Could store the aligned, prescaled offset in the klassoop.
2466   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
2467   add(scan_tmp, scan_tmp, vtable_base);
2468 
2469   if (return_method) {
2470     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
2471     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
2472     if (itable_index.is_register()) {
2473       slli(t0, itable_index.as_register(), 3);
2474     } else {
2475       mv(t0, itable_index.as_constant() << 3);
2476     }
2477     add(recv_klass, recv_klass, t0);
2478     if (itentry_off) {
2479       add(recv_klass, recv_klass, itentry_off);
2480     }
2481   }
2482 
2483   Label search, found_method;
2484 
2485   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2486   beq(intf_klass, method_result, found_method);
2487   bind(search);
2488   // Check that the previous entry is non-null. A null entry means that
2489   // the receiver class doesn't implement the interface, and wasn't the
2490   // same as when the caller was compiled.
2491   beqz(method_result, L_no_such_interface, /* is_far */ true);
2492   addi(scan_tmp, scan_tmp, scan_step);
2493   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2494   bne(intf_klass, method_result, search);
2495 
2496   bind(found_method);
2497 
2498   // Got a hit.
2499   if (return_method) {
2500     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
2501     add(method_result, recv_klass, scan_tmp);
2502     ld(method_result, Address(method_result));
2503   }
2504 }
2505 
2506 // virtual method calling
2507 void MacroAssembler::lookup_virtual_method(Register recv_klass,
2508                                            RegisterOrConstant vtable_index,
2509                                            Register method_result) {
2510   const ByteSize base = Klass::vtable_start_offset();
2511   assert(vtableEntry::size() * wordSize == 8,
2512          "adjust the scaling in the code below");
2513   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
2514 
2515   if (vtable_index.is_register()) {
2516     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
2517     ld(method_result, Address(method_result, vtable_offset_in_bytes));
2518   } else {
2519     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
2520     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
2521   }
2522 }
2523 
2524 void MacroAssembler::membar(uint32_t order_constraint) {
2525   address prev = pc() - NativeMembar::instruction_size;
2526   address last = code()->last_insn();
2527 
2528   if (last != nullptr && nativeInstruction_at(last)->is_membar() && prev == last) {
2529     NativeMembar *bar = NativeMembar_at(prev);
2530     // We are merging two memory barrier instructions.  On RISCV we
2531     // can do this simply by ORing them together.
2532     bar->set_kind(bar->get_kind() | order_constraint);
2533     BLOCK_COMMENT("merged membar");
2534   } else {
2535     code()->set_last_insn(pc());
2536 
2537     uint32_t predecessor = 0;
2538     uint32_t successor = 0;
2539 
2540     membar_mask_to_pred_succ(order_constraint, predecessor, successor);
2541     fence(predecessor, successor);
2542   }
2543 }
2544 
2545 // Form an address from base + offset in Rd. Rd my or may not
2546 // actually be used: you must use the Address that is returned. It
2547 // is up to you to ensure that the shift provided matches the size
2548 // of your data.
2549 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
2550   if (is_simm12(byte_offset)) { // 12: imm in range 2^12
2551     return Address(base, byte_offset);
2552   }
2553 
2554   assert_different_registers(Rd, base, noreg);
2555 
2556   // Do it the hard way
2557   mv(Rd, byte_offset);
2558   add(Rd, base, Rd);
2559   return Address(Rd);
2560 }
2561 
2562 void MacroAssembler::check_klass_subtype(Register sub_klass,
2563                                          Register super_klass,
2564                                          Register tmp_reg,
2565                                          Label& L_success) {
2566   Label L_failure;
2567   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
2568   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
2569   bind(L_failure);
2570 }
2571 
2572 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
2573   ld(t0, Address(xthread, JavaThread::polling_word_offset()));
2574   if (acquire) {
2575     membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
2576   }
2577   if (at_return) {
2578     bgtu(in_nmethod ? sp : fp, t0, slow_path, /* is_far */ true);
2579   } else {
2580     test_bit(t0, t0, exact_log2(SafepointMechanism::poll_bit()));
2581     bnez(t0, slow_path, true /* is_far */);
2582   }
2583 }
2584 
2585 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2586                                 Label &succeed, Label *fail) {
2587   assert_different_registers(addr, tmp);
2588   assert_different_registers(newv, tmp);
2589   assert_different_registers(oldv, tmp);
2590 
2591   // oldv holds comparison value
2592   // newv holds value to write in exchange
2593   // addr identifies memory word to compare against/update
2594   Label retry_load, nope;
2595   bind(retry_load);
2596   // Load reserved from the memory location
2597   lr_d(tmp, addr, Assembler::aqrl);
2598   // Fail and exit if it is not what we expect
2599   bne(tmp, oldv, nope);
2600   // If the store conditional succeeds, tmp will be zero
2601   sc_d(tmp, newv, addr, Assembler::rl);
2602   beqz(tmp, succeed);
2603   // Retry only when the store conditional failed
2604   j(retry_load);
2605 
2606   bind(nope);
2607   membar(AnyAny);
2608   mv(oldv, tmp);
2609   if (fail != nullptr) {
2610     j(*fail);
2611   }
2612 }
2613 
2614 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2615                                         Label &succeed, Label *fail) {
2616   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2617   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2618 }
2619 
2620 void MacroAssembler::load_reserved(Register addr,
2621                                    enum operand_size size,
2622                                    Assembler::Aqrl acquire) {
2623   switch (size) {
2624     case int64:
2625       lr_d(t0, addr, acquire);
2626       break;
2627     case int32:
2628       lr_w(t0, addr, acquire);
2629       break;
2630     case uint32:
2631       lr_w(t0, addr, acquire);
2632       zero_extend(t0, t0, 32);
2633       break;
2634     default:
2635       ShouldNotReachHere();
2636   }
2637 }
2638 
2639 void MacroAssembler::store_conditional(Register addr,
2640                                        Register new_val,
2641                                        enum operand_size size,
2642                                        Assembler::Aqrl release) {
2643   switch (size) {
2644     case int64:
2645       sc_d(t0, new_val, addr, release);
2646       break;
2647     case int32:
2648     case uint32:
2649       sc_w(t0, new_val, addr, release);
2650       break;
2651     default:
2652       ShouldNotReachHere();
2653   }
2654 }
2655 
2656 
2657 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected,
2658                                                  Register new_val,
2659                                                  enum operand_size size,
2660                                                  Register tmp1, Register tmp2, Register tmp3) {
2661   assert(size == int8 || size == int16, "unsupported operand size");
2662 
2663   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3;
2664 
2665   andi(shift, addr, 3);
2666   slli(shift, shift, 3);
2667 
2668   andi(aligned_addr, addr, ~3);
2669 
2670   if (size == int8) {
2671     mv(mask, 0xff);
2672   } else {
2673     // size == int16 case
2674     mv(mask, -1);
2675     zero_extend(mask, mask, 16);
2676   }
2677   sll(mask, mask, shift);
2678 
2679   xori(not_mask, mask, -1);
2680 
2681   sll(expected, expected, shift);
2682   andr(expected, expected, mask);
2683 
2684   sll(new_val, new_val, shift);
2685   andr(new_val, new_val, mask);
2686 }
2687 
2688 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
2689 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w,
2690 // which are forced to work with 4-byte aligned address.
2691 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
2692                                           Register new_val,
2693                                           enum operand_size size,
2694                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
2695                                           Register result, bool result_as_bool,
2696                                           Register tmp1, Register tmp2, Register tmp3) {
2697   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2698   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2699   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2700 
2701   Label retry, fail, done;
2702 
2703   bind(retry);
2704   lr_w(old, aligned_addr, acquire);
2705   andr(tmp, old, mask);
2706   bne(tmp, expected, fail);
2707 
2708   andr(tmp, old, not_mask);
2709   orr(tmp, tmp, new_val);
2710   sc_w(tmp, tmp, aligned_addr, release);
2711   bnez(tmp, retry);
2712 
2713   if (result_as_bool) {
2714     mv(result, 1);
2715     j(done);
2716 
2717     bind(fail);
2718     mv(result, zr);
2719 
2720     bind(done);
2721   } else {
2722     andr(tmp, old, mask);
2723 
2724     bind(fail);
2725     srl(result, tmp, shift);
2726 
2727     if (size == int8) {
2728       sign_extend(result, result, 8);
2729     } else {
2730       // size == int16 case
2731       sign_extend(result, result, 16);
2732     }
2733   }
2734 }
2735 
2736 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
2737 // the weak CAS stuff. The major difference is that it just failed when store conditional
2738 // failed.
2739 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
2740                                                Register new_val,
2741                                                enum operand_size size,
2742                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
2743                                                Register result,
2744                                                Register tmp1, Register tmp2, Register tmp3) {
2745   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2746   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2747   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2748 
2749   Label fail, done;
2750 
2751   lr_w(old, aligned_addr, acquire);
2752   andr(tmp, old, mask);
2753   bne(tmp, expected, fail);
2754 
2755   andr(tmp, old, not_mask);
2756   orr(tmp, tmp, new_val);
2757   sc_w(tmp, tmp, aligned_addr, release);
2758   bnez(tmp, fail);
2759 
2760   // Success
2761   mv(result, 1);
2762   j(done);
2763 
2764   // Fail
2765   bind(fail);
2766   mv(result, zr);
2767 
2768   bind(done);
2769 }
2770 
2771 void MacroAssembler::cmpxchg(Register addr, Register expected,
2772                              Register new_val,
2773                              enum operand_size size,
2774                              Assembler::Aqrl acquire, Assembler::Aqrl release,
2775                              Register result, bool result_as_bool) {
2776   assert(size != int8 && size != int16, "unsupported operand size");
2777   assert_different_registers(addr, t0);
2778   assert_different_registers(expected, t0);
2779   assert_different_registers(new_val, t0);
2780 
2781   Label retry_load, done, ne_done;
2782   bind(retry_load);
2783   load_reserved(addr, size, acquire);
2784   bne(t0, expected, ne_done);
2785   store_conditional(addr, new_val, size, release);
2786   bnez(t0, retry_load);
2787 
2788   // equal, succeed
2789   if (result_as_bool) {
2790     mv(result, 1);
2791   } else {
2792     mv(result, expected);
2793   }
2794   j(done);
2795 
2796   // not equal, failed
2797   bind(ne_done);
2798   if (result_as_bool) {
2799     mv(result, zr);
2800   } else {
2801     mv(result, t0);
2802   }
2803 
2804   bind(done);
2805 }
2806 
2807 void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
2808                                   Register new_val,
2809                                   enum operand_size size,
2810                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
2811                                   Register result) {
2812   assert_different_registers(addr, t0);
2813   assert_different_registers(expected, t0);
2814   assert_different_registers(new_val, t0);
2815 
2816   Label fail, done;
2817   load_reserved(addr, size, acquire);
2818   bne(t0, expected, fail);
2819   store_conditional(addr, new_val, size, release);
2820   bnez(t0, fail);
2821 
2822   // Success
2823   mv(result, 1);
2824   j(done);
2825 
2826   // Fail
2827   bind(fail);
2828   mv(result, zr);
2829 
2830   bind(done);
2831 }
2832 
2833 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
2834 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2835   prev = prev->is_valid() ? prev : zr;                                                      \
2836   if (incr.is_register()) {                                                                 \
2837     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
2838   } else {                                                                                  \
2839     mv(t0, incr.as_constant());                                                             \
2840     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
2841   }                                                                                         \
2842   return;                                                                                   \
2843 }
2844 
2845 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
2846 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
2847 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
2848 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
2849 
2850 #undef ATOMIC_OP
2851 
2852 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
2853 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
2854   prev = prev->is_valid() ? prev : zr;                                               \
2855   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
2856   return;                                                                            \
2857 }
2858 
2859 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
2860 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
2861 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
2862 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
2863 
2864 #undef ATOMIC_XCHG
2865 
2866 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
2867 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
2868   atomic_##OP2(prev, newv, addr);                                                    \
2869   zero_extend(prev, prev, 32);                                                       \
2870   return;                                                                            \
2871 }
2872 
2873 ATOMIC_XCHGU(xchgwu, xchgw)
2874 ATOMIC_XCHGU(xchgalwu, xchgalw)
2875 
2876 #undef ATOMIC_XCHGU
2877 
2878 void MacroAssembler::far_jump(Address entry, Register tmp) {
2879   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
2880   assert(CodeCache::find_blob(entry.target()) != nullptr,
2881          "destination of far call not found in code cache");
2882   assert(entry.rspec().type() == relocInfo::external_word_type
2883         || entry.rspec().type() == relocInfo::runtime_call_type
2884         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
2885   IncompressibleRegion ir(this);  // Fixed length: see MacroAssembler::far_branch_size()
2886   if (far_branches()) {
2887     // We can use auipc + jalr here because we know that the total size of
2888     // the code cache cannot exceed 2Gb.
2889     relocate(entry.rspec(), [&] {
2890       int32_t offset;
2891       la_patchable(tmp, entry, offset);
2892       jalr(x0, tmp, offset);
2893     });
2894   } else {
2895     j(entry);
2896   }
2897 }
2898 
2899 void MacroAssembler::far_call(Address entry, Register tmp) {
2900   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
2901   assert(CodeCache::find_blob(entry.target()) != nullptr,
2902          "destination of far call not found in code cache");
2903   assert(entry.rspec().type() == relocInfo::external_word_type
2904         || entry.rspec().type() == relocInfo::runtime_call_type
2905         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
2906   IncompressibleRegion ir(this);  // Fixed length: see MacroAssembler::far_branch_size()
2907   if (far_branches()) {
2908     // We can use auipc + jalr here because we know that the total size of
2909     // the code cache cannot exceed 2Gb.
2910     relocate(entry.rspec(), [&] {
2911       int32_t offset;
2912       la_patchable(tmp, entry, offset);
2913       jalr(x1, tmp, offset); // link
2914     });
2915   } else {
2916     jal(entry); // link
2917   }
2918 }
2919 
2920 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
2921                                                    Register super_klass,
2922                                                    Register tmp_reg,
2923                                                    Label* L_success,
2924                                                    Label* L_failure,
2925                                                    Label* L_slow_path,
2926                                                    Register super_check_offset) {
2927   assert_different_registers(sub_klass, super_klass, tmp_reg);
2928   bool must_load_sco = (super_check_offset == noreg);
2929   if (must_load_sco) {
2930     assert(tmp_reg != noreg, "supply either a temp or a register offset");
2931   } else {
2932     assert_different_registers(sub_klass, super_klass, super_check_offset);
2933   }
2934 
2935   Label L_fallthrough;
2936   int label_nulls = 0;
2937   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
2938   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
2939   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
2940   assert(label_nulls <= 1, "at most one null in batch");
2941 
2942   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
2943   int sco_offset = in_bytes(Klass::super_check_offset_offset());
2944   Address super_check_offset_addr(super_klass, sco_offset);
2945 
2946   // Hacked jmp, which may only be used just before L_fallthrough.
2947 #define final_jmp(label)                                                \
2948   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
2949   else                            j(label)             /*omit semi*/
2950 
2951   // If the pointers are equal, we are done (e.g., String[] elements).
2952   // This self-check enables sharing of secondary supertype arrays among
2953   // non-primary types such as array-of-interface. Otherwise, each such
2954   // type would need its own customized SSA.
2955   // We move this check to the front of the fast path because many
2956   // type checks are in fact trivially successful in this manner,
2957   // so we get a nicely predicted branch right at the start of the check.
2958   beq(sub_klass, super_klass, *L_success);
2959 
2960   // Check the supertype display:
2961   if (must_load_sco) {
2962     lwu(tmp_reg, super_check_offset_addr);
2963     super_check_offset = tmp_reg;
2964   }
2965   add(t0, sub_klass, super_check_offset);
2966   Address super_check_addr(t0);
2967   ld(t0, super_check_addr); // load displayed supertype
2968 
2969   // This check has worked decisively for primary supers.
2970   // Secondary supers are sought in the super_cache ('super_cache_addr').
2971   // (Secondary supers are interfaces and very deeply nested subtypes.)
2972   // This works in the same check above because of a tricky aliasing
2973   // between the super_Cache and the primary super display elements.
2974   // (The 'super_check_addr' can address either, as the case requires.)
2975   // Note that the cache is updated below if it does not help us find
2976   // what we need immediately.
2977   // So if it was a primary super, we can just fail immediately.
2978   // Otherwise, it's the slow path for us (no success at this point).
2979 
2980   beq(super_klass, t0, *L_success);
2981   mv(t1, sc_offset);
2982   if (L_failure == &L_fallthrough) {
2983     beq(super_check_offset, t1, *L_slow_path);
2984   } else {
2985     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
2986     final_jmp(*L_slow_path);
2987   }
2988 
2989   bind(L_fallthrough);
2990 
2991 #undef final_jmp
2992 }
2993 
2994 // Scans count pointer sized words at [addr] for occurrence of value,
2995 // generic
2996 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
2997                                 Register tmp) {
2998   Label Lloop, Lexit;
2999   beqz(count, Lexit);
3000   bind(Lloop);
3001   ld(tmp, addr);
3002   beq(value, tmp, Lexit);
3003   add(addr, addr, wordSize);
3004   sub(count, count, 1);
3005   bnez(count, Lloop);
3006   bind(Lexit);
3007 }
3008 
3009 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3010                                                    Register super_klass,
3011                                                    Register tmp1_reg,
3012                                                    Register tmp2_reg,
3013                                                    Label* L_success,
3014                                                    Label* L_failure) {
3015   assert_different_registers(sub_klass, super_klass, tmp1_reg);
3016   if (tmp2_reg != noreg) {
3017     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
3018   }
3019 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
3020 
3021   Label L_fallthrough;
3022   int label_nulls = 0;
3023   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3024   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3025 
3026   assert(label_nulls <= 1, "at most one null in the batch");
3027 
3028   // A couple of useful fields in sub_klass:
3029   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3030   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3031   Address secondary_supers_addr(sub_klass, ss_offset);
3032   Address super_cache_addr(     sub_klass, sc_offset);
3033 
3034   BLOCK_COMMENT("check_klass_subtype_slow_path");
3035 
3036   // Do a linear scan of the secondary super-klass chain.
3037   // This code is rarely used, so simplicity is a virtue here.
3038   // The repne_scan instruction uses fixed registers, which we must spill.
3039   // Don't worry too much about pre-existing connections with the input regs.
3040 
3041   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
3042   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
3043 
3044   RegSet pushed_registers;
3045   if (!IS_A_TEMP(x12)) {
3046     pushed_registers += x12;
3047   }
3048   if (!IS_A_TEMP(x15)) {
3049     pushed_registers += x15;
3050   }
3051 
3052   if (super_klass != x10) {
3053     if (!IS_A_TEMP(x10)) {
3054       pushed_registers += x10;
3055     }
3056   }
3057 
3058   push_reg(pushed_registers, sp);
3059 
3060   // Get super_klass value into x10 (even if it was in x15 or x12)
3061   mv(x10, super_klass);
3062 
3063 #ifndef PRODUCT
3064   mv(t1, (address)&SharedRuntime::_partial_subtype_ctr);
3065   Address pst_counter_addr(t1);
3066   ld(t0, pst_counter_addr);
3067   add(t0, t0, 1);
3068   sd(t0, pst_counter_addr);
3069 #endif // PRODUCT
3070 
3071   // We will consult the secondary-super array.
3072   ld(x15, secondary_supers_addr);
3073   // Load the array length.
3074   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
3075   // Skip to start of data.
3076   add(x15, x15, Array<Klass*>::base_offset_in_bytes());
3077 
3078   // Set t0 to an obvious invalid value, falling through by default
3079   mv(t0, -1);
3080   // Scan X12 words at [X15] for an occurrence of X10.
3081   repne_scan(x15, x10, x12, t0);
3082 
3083   // pop will restore x10, so we should use a temp register to keep its value
3084   mv(t1, x10);
3085 
3086   // Unspill the temp registers:
3087   pop_reg(pushed_registers, sp);
3088 
3089   bne(t1, t0, *L_failure);
3090 
3091   // Success. Cache the super we found an proceed in triumph.
3092   sd(super_klass, super_cache_addr);
3093 
3094   if (L_success != &L_fallthrough) {
3095     j(*L_success);
3096   }
3097 
3098 #undef IS_A_TEMP
3099 
3100   bind(L_fallthrough);
3101 }
3102 
3103 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
3104 void MacroAssembler::tlab_allocate(Register obj,
3105                                    Register var_size_in_bytes,
3106                                    int con_size_in_bytes,
3107                                    Register tmp1,
3108                                    Register tmp2,
3109                                    Label& slow_case,
3110                                    bool is_far) {
3111   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3112   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
3113 }
3114 
3115 // get_thread() can be called anywhere inside generated code so we
3116 // need to save whatever non-callee save context might get clobbered
3117 // by the call to Thread::current() or, indeed, the call setup code.
3118 void MacroAssembler::get_thread(Register thread) {
3119   // save all call-clobbered regs except thread
3120   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
3121                       RegSet::range(x28, x31) + ra - thread;
3122   push_reg(saved_regs, sp);
3123 
3124   mv(ra, CAST_FROM_FN_PTR(address, Thread::current));
3125   jalr(ra);
3126   if (thread != c_rarg0) {
3127     mv(thread, c_rarg0);
3128   }
3129 
3130   // restore pushed registers
3131   pop_reg(saved_regs, sp);
3132 }
3133 
3134 void MacroAssembler::load_byte_map_base(Register reg) {
3135   CardTable::CardValue* byte_map_base =
3136     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
3137   mv(reg, (uint64_t)byte_map_base);
3138 }
3139 
3140 void MacroAssembler::la_patchable(Register reg1, const Address &dest, int32_t &offset) {
3141   unsigned long low_address = (uintptr_t)CodeCache::low_bound();
3142   unsigned long high_address = (uintptr_t)CodeCache::high_bound();
3143   unsigned long dest_address = (uintptr_t)dest.target();
3144   long offset_low = dest_address - low_address;
3145   long offset_high = dest_address - high_address;
3146 
3147   assert(dest.getMode() == Address::literal, "la_patchable must be applied to a literal address");
3148   assert((uintptr_t)dest.target() < (1ull << 48), "bad address");
3149 
3150   // RISC-V doesn't compute a page-aligned address, in order to partially
3151   // compensate for the use of *signed* offsets in its base+disp12
3152   // addressing mode (RISC-V's PC-relative reach remains asymmetric
3153   // [-(2G + 2K), 2G - 2K).
3154   if (offset_high >= -((1L << 31) + (1L << 11)) && offset_low < (1L << 31) - (1L << 11)) {
3155     int64_t distance = dest.target() - pc();
3156     auipc(reg1, (int32_t)distance + 0x800);
3157     offset = ((int32_t)distance << 20) >> 20;
3158   } else {
3159     movptr(reg1, dest.target(), offset);
3160   }
3161 }
3162 
3163 void MacroAssembler::build_frame(int framesize) {
3164   assert(framesize >= 2, "framesize must include space for FP/RA");
3165   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3166   sub(sp, sp, framesize);
3167   sd(fp, Address(sp, framesize - 2 * wordSize));
3168   sd(ra, Address(sp, framesize - wordSize));
3169   if (PreserveFramePointer) { add(fp, sp, framesize); }
3170 }
3171 
3172 void MacroAssembler::remove_frame(int framesize) {
3173   assert(framesize >= 2, "framesize must include space for FP/RA");
3174   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3175   ld(fp, Address(sp, framesize - 2 * wordSize));
3176   ld(ra, Address(sp, framesize - wordSize));
3177   add(sp, sp, framesize);
3178 }
3179 
3180 void MacroAssembler::reserved_stack_check() {
3181     // testing if reserved zone needs to be enabled
3182     Label no_reserved_zone_enabling;
3183 
3184     ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
3185     bltu(sp, t0, no_reserved_zone_enabling);
3186 
3187     enter();   // RA and FP are live.
3188     mv(c_rarg0, xthread);
3189     RuntimeAddress target(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
3190     relocate(target.rspec(), [&] {
3191       int32_t offset;
3192       la_patchable(t0, target, offset);
3193       jalr(x1, t0, offset);
3194     });
3195     leave();
3196 
3197     // We have already removed our own frame.
3198     // throw_delayed_StackOverflowError will think that it's been
3199     // called by our caller.
3200     target = RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry());
3201     relocate(target.rspec(), [&] {
3202       int32_t offset;
3203       la_patchable(t0, target, offset);
3204       jalr(x0, t0, offset);
3205     });
3206     should_not_reach_here();
3207 
3208     bind(no_reserved_zone_enabling);
3209 }
3210 
3211 // Move the address of the polling page into dest.
3212 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
3213   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
3214 }
3215 
3216 // Read the polling page.  The address of the polling page must
3217 // already be in r.
3218 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
3219   relocate(rtype, [&] {
3220     lwu(zr, Address(r, offset));
3221   });
3222 }
3223 
3224 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3225 #ifdef ASSERT
3226   {
3227     ThreadInVMfromUnknown tiv;
3228     assert (UseCompressedOops, "should only be used for compressed oops");
3229     assert (Universe::heap() != nullptr, "java heap should be initialized");
3230     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
3231     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3232   }
3233 #endif
3234   int oop_index = oop_recorder()->find_index(obj);
3235   relocate(oop_Relocation::spec(oop_index), [&] {
3236     li32(dst, 0xDEADBEEF);
3237   });
3238   zero_extend(dst, dst, 32);
3239 }
3240 
3241 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3242   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3243   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
3244   int index = oop_recorder()->find_index(k);
3245   assert(!Universe::heap()->is_in(k), "should not be an oop");
3246 
3247   narrowKlass nk = CompressedKlassPointers::encode(k);
3248   relocate(metadata_Relocation::spec(index), [&] {
3249     li32(dst, nk);
3250   });
3251   zero_extend(dst, dst, 32);
3252 }
3253 
3254 // Maybe emit a call via a trampoline. If the code cache is small
3255 // trampolines won't be emitted.
3256 address MacroAssembler::trampoline_call(Address entry) {
3257   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
3258          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
3259          entry.rspec().type() == relocInfo::static_call_type ||
3260          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
3261 
3262   address target = entry.target();
3263 
3264   // We need a trampoline if branches are far.
3265   if (far_branches()) {
3266     if (!in_scratch_emit_size()) {
3267       if (entry.rspec().type() == relocInfo::runtime_call_type) {
3268         assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
3269         code()->share_trampoline_for(entry.target(), offset());
3270       } else {
3271         address stub = emit_trampoline_stub(offset(), target);
3272         if (stub == nullptr) {
3273           postcond(pc() == badAddress);
3274           return nullptr; // CodeCache is full
3275         }
3276       }
3277     }
3278     target = pc();
3279   }
3280 
3281   address call_pc = pc();
3282 #ifdef ASSERT
3283   if (entry.rspec().type() != relocInfo::runtime_call_type) {
3284     assert_alignment(call_pc);
3285   }
3286 #endif
3287   relocate(entry.rspec(), [&] {
3288     jal(target);
3289   });
3290 
3291   postcond(pc() != badAddress);
3292   return call_pc;
3293 }
3294 
3295 address MacroAssembler::ic_call(address entry, jint method_index) {
3296   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
3297   IncompressibleRegion ir(this);  // relocations
3298   movptr(t1, (address)Universe::non_oop_word());
3299   assert_cond(entry != nullptr);
3300   return trampoline_call(Address(entry, rh));
3301 }
3302 
3303 // Emit a trampoline stub for a call to a target which is too far away.
3304 //
3305 // code sequences:
3306 //
3307 // call-site:
3308 //   branch-and-link to <destination> or <trampoline stub>
3309 //
3310 // Related trampoline stub for this call site in the stub section:
3311 //   load the call target from the constant pool
3312 //   branch (RA still points to the call site above)
3313 
3314 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
3315                                              address dest) {
3316   // Max stub size: alignment nop, TrampolineStub.
3317   address stub = start_a_stub(max_trampoline_stub_size());
3318   if (stub == nullptr) {
3319     return nullptr;  // CodeBuffer::expand failed
3320   }
3321 
3322   // We are always 4-byte aligned here.
3323   assert_alignment(pc());
3324 
3325   // Create a trampoline stub relocation which relates this trampoline stub
3326   // with the call instruction at insts_call_instruction_offset in the
3327   // instructions code-section.
3328 
3329   // Make sure the address of destination 8-byte aligned after 3 instructions.
3330   align(wordSize, NativeCallTrampolineStub::data_offset);
3331 
3332   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
3333                                                          insts_call_instruction_offset);
3334   const int stub_start_offset = offset();
3335   relocate(rh, [&] {
3336     // Now, create the trampoline stub's code:
3337     // - load the call
3338     // - call
3339     Label target;
3340     ld(t0, target);  // auipc + ld
3341     jr(t0);          // jalr
3342     bind(target);
3343     assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
3344            "should be");
3345     assert(offset() % wordSize == 0, "bad alignment");
3346     emit_int64((int64_t)dest);
3347   });
3348 
3349   const address stub_start_addr = addr_at(stub_start_offset);
3350 
3351   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
3352 
3353   end_a_stub();
3354   return stub_start_addr;
3355 }
3356 
3357 int MacroAssembler::max_trampoline_stub_size() {
3358   // Max stub size: alignment nop, TrampolineStub.
3359   return NativeInstruction::instruction_size + NativeCallTrampolineStub::instruction_size;
3360 }
3361 
3362 int MacroAssembler::static_call_stub_size() {
3363   // (lui, addi, slli, addi, slli, addi) + (lui, addi, slli, addi, slli) + jalr
3364   return 12 * NativeInstruction::instruction_size;
3365 }
3366 
3367 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
3368   switch (dst.getMode()) {
3369     case Address::base_plus_offset:
3370       // This is the expected mode, although we allow all the other
3371       // forms below.
3372       return form_address(tmp, dst.base(), dst.offset());
3373     default:
3374       la(tmp, dst);
3375       return Address(tmp);
3376   }
3377 }
3378 
3379 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3380   assert(((dst.getMode() == Address::base_plus_offset &&
3381            is_simm12(dst.offset())) || is_simm12(value)),
3382           "invalid value and address mode combination");
3383   Address adr = add_memory_helper(dst, tmp2);
3384   assert(!adr.uses(tmp1), "invalid dst for address increment");
3385   ld(tmp1, adr);
3386   add(tmp1, tmp1, value, tmp2);
3387   sd(tmp1, adr);
3388 }
3389 
3390 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3391   assert(((dst.getMode() == Address::base_plus_offset &&
3392            is_simm12(dst.offset())) || is_simm12(value)),
3393           "invalid value and address mode combination");
3394   Address adr = add_memory_helper(dst, tmp2);
3395   assert(!adr.uses(tmp1), "invalid dst for address increment");
3396   lwu(tmp1, adr);
3397   addw(tmp1, tmp1, value, tmp2);
3398   sw(tmp1, adr);
3399 }
3400 
3401 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3402   assert(((dst.getMode() == Address::base_plus_offset &&
3403            is_simm12(dst.offset())) || is_simm12(value)),
3404           "invalid value and address mode combination");
3405   Address adr = add_memory_helper(dst, tmp2);
3406   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3407   ld(tmp1, adr);
3408   sub(tmp1, tmp1, value, tmp2);
3409   sd(tmp1, adr);
3410 }
3411 
3412 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3413   assert(((dst.getMode() == Address::base_plus_offset &&
3414            is_simm12(dst.offset())) || is_simm12(value)),
3415           "invalid value and address mode combination");
3416   Address adr = add_memory_helper(dst, tmp2);
3417   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3418   lwu(tmp1, adr);
3419   subw(tmp1, tmp1, value, tmp2);
3420   sw(tmp1, adr);
3421 }
3422 
3423 void MacroAssembler::cmpptr(Register src1, Address src2, Label& equal) {
3424   assert_different_registers(src1, t0);
3425   relocate(src2.rspec(), [&] {
3426     int32_t offset;
3427     la_patchable(t0, src2, offset);
3428     ld(t0, Address(t0, offset));
3429   });
3430   beq(src1, t0, equal);
3431 }
3432 
3433 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
3434   load_method_holder(result, method);
3435   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
3436 }
3437 
3438 void MacroAssembler::load_method_holder(Register holder, Register method) {
3439   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3440   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3441   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
3442 }
3443 
3444 // string indexof
3445 // compute index by trailing zeros
3446 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
3447                                    Register match_mask, Register result,
3448                                    Register ch2, Register tmp,
3449                                    bool haystack_isL) {
3450   int haystack_chr_shift = haystack_isL ? 0 : 1;
3451   srl(match_mask, match_mask, trailing_zeros);
3452   srli(match_mask, match_mask, 1);
3453   srli(tmp, trailing_zeros, LogBitsPerByte);
3454   if (!haystack_isL) andi(tmp, tmp, 0xE);
3455   add(haystack, haystack, tmp);
3456   ld(ch2, Address(haystack));
3457   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
3458   add(result, result, tmp);
3459 }
3460 
3461 // string indexof
3462 // Find pattern element in src, compute match mask,
3463 // only the first occurrence of 0x80/0x8000 at low bits is the valid match index
3464 // match mask patterns and corresponding indices would be like:
3465 // - 0x8080808080808080 (Latin1)
3466 // -   7 6 5 4 3 2 1 0  (match index)
3467 // - 0x8000800080008000 (UTF16)
3468 // -   3   2   1   0    (match index)
3469 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
3470                                         Register mask1, Register mask2) {
3471   xorr(src, pattern, src);
3472   sub(match_mask, src, mask1);
3473   orr(src, src, mask2);
3474   notr(src, src);
3475   andr(match_mask, match_mask, src);
3476 }
3477 
3478 #ifdef COMPILER2
3479 // Code for BigInteger::mulAdd intrinsic
3480 // out     = x10
3481 // in      = x11
3482 // offset  = x12  (already out.length-offset)
3483 // len     = x13
3484 // k       = x14
3485 // tmp     = x28
3486 //
3487 // pseudo code from java implementation:
3488 // long kLong = k & LONG_MASK;
3489 // carry = 0;
3490 // offset = out.length-offset - 1;
3491 // for (int j = len - 1; j >= 0; j--) {
3492 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3493 //     out[offset--] = (int)product;
3494 //     carry = product >>> 32;
3495 // }
3496 // return (int)carry;
3497 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3498                              Register len, Register k, Register tmp) {
3499   Label L_tail_loop, L_unroll, L_end;
3500   mv(tmp, out);
3501   mv(out, zr);
3502   blez(len, L_end);
3503   zero_extend(k, k, 32);
3504   slliw(t0, offset, LogBytesPerInt);
3505   add(offset, tmp, t0);
3506   slliw(t0, len, LogBytesPerInt);
3507   add(in, in, t0);
3508 
3509   const int unroll = 8;
3510   mv(tmp, unroll);
3511   blt(len, tmp, L_tail_loop);
3512   bind(L_unroll);
3513   for (int i = 0; i < unroll; i++) {
3514     sub(in, in, BytesPerInt);
3515     lwu(t0, Address(in, 0));
3516     mul(t1, t0, k);
3517     add(t0, t1, out);
3518     sub(offset, offset, BytesPerInt);
3519     lwu(t1, Address(offset, 0));
3520     add(t0, t0, t1);
3521     sw(t0, Address(offset, 0));
3522     srli(out, t0, 32);
3523   }
3524   subw(len, len, tmp);
3525   bge(len, tmp, L_unroll);
3526 
3527   bind(L_tail_loop);
3528   blez(len, L_end);
3529   sub(in, in, BytesPerInt);
3530   lwu(t0, Address(in, 0));
3531   mul(t1, t0, k);
3532   add(t0, t1, out);
3533   sub(offset, offset, BytesPerInt);
3534   lwu(t1, Address(offset, 0));
3535   add(t0, t0, t1);
3536   sw(t0, Address(offset, 0));
3537   srli(out, t0, 32);
3538   subw(len, len, 1);
3539   j(L_tail_loop);
3540 
3541   bind(L_end);
3542 }
3543 
3544 // add two unsigned input and output carry
3545 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
3546 {
3547   assert_different_registers(dst, carry);
3548   assert_different_registers(dst, src2);
3549   add(dst, src1, src2);
3550   sltu(carry, dst, src2);
3551 }
3552 
3553 // add two input with carry
3554 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
3555   assert_different_registers(dst, carry);
3556   add(dst, src1, src2);
3557   add(dst, dst, carry);
3558 }
3559 
3560 // add two unsigned input with carry and output carry
3561 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
3562   assert_different_registers(dst, src2);
3563   adc(dst, src1, src2, carry);
3564   sltu(carry, dst, src2);
3565 }
3566 
3567 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
3568                                      Register src1, Register src2, Register carry) {
3569   cad(dest_lo, dest_lo, src1, carry);
3570   add(dest_hi, dest_hi, carry);
3571   cad(dest_lo, dest_lo, src2, carry);
3572   add(final_dest_hi, dest_hi, carry);
3573 }
3574 
3575 /**
3576  * Multiply 32 bit by 32 bit first loop.
3577  */
3578 void MacroAssembler::multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
3579                                            Register y, Register y_idx, Register z,
3580                                            Register carry, Register product,
3581                                            Register idx, Register kdx) {
3582   // jlong carry, x[], y[], z[];
3583   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3584   //     long product = y[idx] * x[xstart] + carry;
3585   //     z[kdx] = (int)product;
3586   //     carry = product >>> 32;
3587   // }
3588   // z[xstart] = (int)carry;
3589 
3590   Label L_first_loop, L_first_loop_exit;
3591   blez(idx, L_first_loop_exit);
3592 
3593   shadd(t0, xstart, x, t0, LogBytesPerInt);
3594   lwu(x_xstart, Address(t0, 0));
3595 
3596   bind(L_first_loop);
3597   subw(idx, idx, 1);
3598   shadd(t0, idx, y, t0, LogBytesPerInt);
3599   lwu(y_idx, Address(t0, 0));
3600   mul(product, x_xstart, y_idx);
3601   add(product, product, carry);
3602   srli(carry, product, 32);
3603   subw(kdx, kdx, 1);
3604   shadd(t0, kdx, z, t0, LogBytesPerInt);
3605   sw(product, Address(t0, 0));
3606   bgtz(idx, L_first_loop);
3607 
3608   bind(L_first_loop_exit);
3609 }
3610 
3611 /**
3612  * Multiply 64 bit by 64 bit first loop.
3613  */
3614 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
3615                                            Register y, Register y_idx, Register z,
3616                                            Register carry, Register product,
3617                                            Register idx, Register kdx) {
3618   //
3619   //  jlong carry, x[], y[], z[];
3620   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3621   //    huge_128 product = y[idx] * x[xstart] + carry;
3622   //    z[kdx] = (jlong)product;
3623   //    carry  = (jlong)(product >>> 64);
3624   //  }
3625   //  z[xstart] = carry;
3626   //
3627 
3628   Label L_first_loop, L_first_loop_exit;
3629   Label L_one_x, L_one_y, L_multiply;
3630 
3631   subw(xstart, xstart, 1);
3632   bltz(xstart, L_one_x);
3633 
3634   shadd(t0, xstart, x, t0, LogBytesPerInt);
3635   ld(x_xstart, Address(t0, 0));
3636   ror_imm(x_xstart, x_xstart, 32); // convert big-endian to little-endian
3637 
3638   bind(L_first_loop);
3639   subw(idx, idx, 1);
3640   bltz(idx, L_first_loop_exit);
3641   subw(idx, idx, 1);
3642   bltz(idx, L_one_y);
3643 
3644   shadd(t0, idx, y, t0, LogBytesPerInt);
3645   ld(y_idx, Address(t0, 0));
3646   ror_imm(y_idx, y_idx, 32); // convert big-endian to little-endian
3647   bind(L_multiply);
3648 
3649   mulhu(t0, x_xstart, y_idx);
3650   mul(product, x_xstart, y_idx);
3651   cad(product, product, carry, t1);
3652   adc(carry, t0, zr, t1);
3653 
3654   subw(kdx, kdx, 2);
3655   ror_imm(product, product, 32); // back to big-endian
3656   shadd(t0, kdx, z, t0, LogBytesPerInt);
3657   sd(product, Address(t0, 0));
3658 
3659   j(L_first_loop);
3660 
3661   bind(L_one_y);
3662   lwu(y_idx, Address(y, 0));
3663   j(L_multiply);
3664 
3665   bind(L_one_x);
3666   lwu(x_xstart, Address(x, 0));
3667   j(L_first_loop);
3668 
3669   bind(L_first_loop_exit);
3670 }
3671 
3672 /**
3673  * Multiply 128 bit by 128 bit. Unrolled inner loop.
3674  *
3675  */
3676 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3677                                              Register carry, Register carry2,
3678                                              Register idx, Register jdx,
3679                                              Register yz_idx1, Register yz_idx2,
3680                                              Register tmp, Register tmp3, Register tmp4,
3681                                              Register tmp6, Register product_hi) {
3682   //   jlong carry, x[], y[], z[];
3683   //   int kdx = xstart+1;
3684   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3685   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3686   //     jlong carry2  = (jlong)(tmp3 >>> 64);
3687   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
3688   //     carry  = (jlong)(tmp4 >>> 64);
3689   //     z[kdx+idx+1] = (jlong)tmp3;
3690   //     z[kdx+idx] = (jlong)tmp4;
3691   //   }
3692   //   idx += 2;
3693   //   if (idx > 0) {
3694   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3695   //     z[kdx+idx] = (jlong)yz_idx1;
3696   //     carry  = (jlong)(yz_idx1 >>> 64);
3697   //   }
3698   //
3699 
3700   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3701 
3702   srliw(jdx, idx, 2);
3703 
3704   bind(L_third_loop);
3705 
3706   subw(jdx, jdx, 1);
3707   bltz(jdx, L_third_loop_exit);
3708   subw(idx, idx, 4);
3709 
3710   shadd(t0, idx, y, t0, LogBytesPerInt);
3711   ld(yz_idx2, Address(t0, 0));
3712   ld(yz_idx1, Address(t0, wordSize));
3713 
3714   shadd(tmp6, idx, z, t0, LogBytesPerInt);
3715 
3716   ror_imm(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
3717   ror_imm(yz_idx2, yz_idx2, 32);
3718 
3719   ld(t1, Address(tmp6, 0));
3720   ld(t0, Address(tmp6, wordSize));
3721 
3722   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
3723   mulhu(tmp4, product_hi, yz_idx1);
3724 
3725   ror_imm(t0, t0, 32, tmp); // convert big-endian to little-endian
3726   ror_imm(t1, t1, 32, tmp);
3727 
3728   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
3729   mulhu(carry2, product_hi, yz_idx2);
3730 
3731   cad(tmp3, tmp3, carry, carry);
3732   adc(tmp4, tmp4, zr, carry);
3733   cad(tmp3, tmp3, t0, t0);
3734   cadc(tmp4, tmp4, tmp, t0);
3735   adc(carry, carry2, zr, t0);
3736   cad(tmp4, tmp4, t1, carry2);
3737   adc(carry, carry, zr, carry2);
3738 
3739   ror_imm(tmp3, tmp3, 32); // convert little-endian to big-endian
3740   ror_imm(tmp4, tmp4, 32);
3741   sd(tmp4, Address(tmp6, 0));
3742   sd(tmp3, Address(tmp6, wordSize));
3743 
3744   j(L_third_loop);
3745 
3746   bind(L_third_loop_exit);
3747 
3748   andi(idx, idx, 0x3);
3749   beqz(idx, L_post_third_loop_done);
3750 
3751   Label L_check_1;
3752   subw(idx, idx, 2);
3753   bltz(idx, L_check_1);
3754 
3755   shadd(t0, idx, y, t0, LogBytesPerInt);
3756   ld(yz_idx1, Address(t0, 0));
3757   ror_imm(yz_idx1, yz_idx1, 32);
3758 
3759   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
3760   mulhu(tmp4, product_hi, yz_idx1);
3761 
3762   shadd(t0, idx, z, t0, LogBytesPerInt);
3763   ld(yz_idx2, Address(t0, 0));
3764   ror_imm(yz_idx2, yz_idx2, 32, tmp);
3765 
3766   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
3767 
3768   ror_imm(tmp3, tmp3, 32, tmp);
3769   sd(tmp3, Address(t0, 0));
3770 
3771   bind(L_check_1);
3772 
3773   andi(idx, idx, 0x1);
3774   subw(idx, idx, 1);
3775   bltz(idx, L_post_third_loop_done);
3776   shadd(t0, idx, y, t0, LogBytesPerInt);
3777   lwu(tmp4, Address(t0, 0));
3778   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
3779   mulhu(carry2, tmp4, product_hi);
3780 
3781   shadd(t0, idx, z, t0, LogBytesPerInt);
3782   lwu(tmp4, Address(t0, 0));
3783 
3784   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
3785 
3786   shadd(t0, idx, z, t0, LogBytesPerInt);
3787   sw(tmp3, Address(t0, 0));
3788 
3789   slli(t0, carry2, 32);
3790   srli(carry, tmp3, 32);
3791   orr(carry, carry, t0);
3792 
3793   bind(L_post_third_loop_done);
3794 }
3795 
3796 /**
3797  * Code for BigInteger::multiplyToLen() intrinsic.
3798  *
3799  * x10: x
3800  * x11: xlen
3801  * x12: y
3802  * x13: ylen
3803  * x14: z
3804  * x15: zlen
3805  * x16: tmp1
3806  * x17: tmp2
3807  * x7:  tmp3
3808  * x28: tmp4
3809  * x29: tmp5
3810  * x30: tmp6
3811  * x31: tmp7
3812  */
3813 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3814                                      Register z, Register zlen,
3815                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3816                                      Register tmp5, Register tmp6, Register product_hi) {
3817   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3818 
3819   const Register idx = tmp1;
3820   const Register kdx = tmp2;
3821   const Register xstart = tmp3;
3822 
3823   const Register y_idx = tmp4;
3824   const Register carry = tmp5;
3825   const Register product = xlen;
3826   const Register x_xstart = zlen; // reuse register
3827 
3828   mv(idx, ylen); // idx = ylen;
3829   mv(kdx, zlen); // kdx = xlen+ylen;
3830   mv(carry, zr); // carry = 0;
3831 
3832   Label L_multiply_64_x_64_loop, L_done;
3833 
3834   subw(xstart, xlen, 1);
3835   bltz(xstart, L_done);
3836 
3837   const Register jdx = tmp1;
3838 
3839   if (AvoidUnalignedAccesses) {
3840     // Check if x and y are both 8-byte aligned.
3841     orr(t0, xlen, ylen);
3842     test_bit(t0, t0, 0);
3843     beqz(t0, L_multiply_64_x_64_loop);
3844 
3845     multiply_32_x_32_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3846     shadd(t0, xstart, z, t0, LogBytesPerInt);
3847     sw(carry, Address(t0, 0));
3848 
3849     Label L_second_loop_unaligned;
3850     bind(L_second_loop_unaligned);
3851     mv(carry, zr);
3852     mv(jdx, ylen);
3853     subw(xstart, xstart, 1);
3854     bltz(xstart, L_done);
3855     sub(sp, sp, 2 * wordSize);
3856     sd(z, Address(sp, 0));
3857     sd(zr, Address(sp, wordSize));
3858     shadd(t0, xstart, z, t0, LogBytesPerInt);
3859     addi(z, t0, 4);
3860     shadd(t0, xstart, x, t0, LogBytesPerInt);
3861     lwu(product, Address(t0, 0));
3862     Label L_third_loop, L_third_loop_exit;
3863 
3864     blez(jdx, L_third_loop_exit);
3865 
3866     bind(L_third_loop);
3867     subw(jdx, jdx, 1);
3868     shadd(t0, jdx, y, t0, LogBytesPerInt);
3869     lwu(t0, Address(t0, 0));
3870     mul(t1, t0, product);
3871     add(t0, t1, carry);
3872     shadd(tmp6, jdx, z, t1, LogBytesPerInt);
3873     lwu(t1, Address(tmp6, 0));
3874     add(t0, t0, t1);
3875     sw(t0, Address(tmp6, 0));
3876     srli(carry, t0, 32);
3877     bgtz(jdx, L_third_loop);
3878 
3879     bind(L_third_loop_exit);
3880     ld(z, Address(sp, 0));
3881     addi(sp, sp, 2 * wordSize);
3882     shadd(t0, xstart, z, t0, LogBytesPerInt);
3883     sw(carry, Address(t0, 0));
3884 
3885     j(L_second_loop_unaligned);
3886   }
3887 
3888   bind(L_multiply_64_x_64_loop);
3889   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3890 
3891   Label L_second_loop_aligned;
3892   beqz(kdx, L_second_loop_aligned);
3893 
3894   Label L_carry;
3895   subw(kdx, kdx, 1);
3896   beqz(kdx, L_carry);
3897 
3898   shadd(t0, kdx, z, t0, LogBytesPerInt);
3899   sw(carry, Address(t0, 0));
3900   srli(carry, carry, 32);
3901   subw(kdx, kdx, 1);
3902 
3903   bind(L_carry);
3904   shadd(t0, kdx, z, t0, LogBytesPerInt);
3905   sw(carry, Address(t0, 0));
3906 
3907   // Second and third (nested) loops.
3908   //
3909   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3910   //   carry = 0;
3911   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3912   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3913   //                    (z[k] & LONG_MASK) + carry;
3914   //     z[k] = (int)product;
3915   //     carry = product >>> 32;
3916   //   }
3917   //   z[i] = (int)carry;
3918   // }
3919   //
3920   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3921 
3922   bind(L_second_loop_aligned);
3923   mv(carry, zr); // carry = 0;
3924   mv(jdx, ylen); // j = ystart+1
3925 
3926   subw(xstart, xstart, 1); // i = xstart-1;
3927   bltz(xstart, L_done);
3928 
3929   sub(sp, sp, 4 * wordSize);
3930   sd(z, Address(sp, 0));
3931 
3932   Label L_last_x;
3933   shadd(t0, xstart, z, t0, LogBytesPerInt);
3934   addi(z, t0, 4);
3935   subw(xstart, xstart, 1); // i = xstart-1;
3936   bltz(xstart, L_last_x);
3937 
3938   shadd(t0, xstart, x, t0, LogBytesPerInt);
3939   ld(product_hi, Address(t0, 0));
3940   ror_imm(product_hi, product_hi, 32); // convert big-endian to little-endian
3941 
3942   Label L_third_loop_prologue;
3943   bind(L_third_loop_prologue);
3944 
3945   sd(ylen, Address(sp, wordSize));
3946   sd(x, Address(sp, 2 * wordSize));
3947   sd(xstart, Address(sp, 3 * wordSize));
3948   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3949                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3950   ld(z, Address(sp, 0));
3951   ld(ylen, Address(sp, wordSize));
3952   ld(x, Address(sp, 2 * wordSize));
3953   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
3954   addi(sp, sp, 4 * wordSize);
3955 
3956   addiw(tmp3, xlen, 1);
3957   shadd(t0, tmp3, z, t0, LogBytesPerInt);
3958   sw(carry, Address(t0, 0));
3959 
3960   subw(tmp3, tmp3, 1);
3961   bltz(tmp3, L_done);
3962 
3963   srli(carry, carry, 32);
3964   shadd(t0, tmp3, z, t0, LogBytesPerInt);
3965   sw(carry, Address(t0, 0));
3966   j(L_second_loop_aligned);
3967 
3968   // Next infrequent code is moved outside loops.
3969   bind(L_last_x);
3970   lwu(product_hi, Address(x, 0));
3971   j(L_third_loop_prologue);
3972 
3973   bind(L_done);
3974 }
3975 #endif
3976 
3977 // Count bits of trailing zero chars from lsb to msb until first non-zero element.
3978 // For LL case, one byte for one element, so shift 8 bits once, and for other case,
3979 // shift 16 bits once.
3980 void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1, Register tmp2) {
3981   if (UseZbb) {
3982     assert_different_registers(Rd, Rs, tmp1);
3983     int step = isLL ? 8 : 16;
3984     ctz(Rd, Rs);
3985     andi(tmp1, Rd, step - 1);
3986     sub(Rd, Rd, tmp1);
3987     return;
3988   }
3989 
3990   assert_different_registers(Rd, Rs, tmp1, tmp2);
3991   Label Loop;
3992   int step = isLL ? 8 : 16;
3993   mv(Rd, -step);
3994   mv(tmp2, Rs);
3995 
3996   bind(Loop);
3997   addi(Rd, Rd, step);
3998   andi(tmp1, tmp2, ((1 << step) - 1));
3999   srli(tmp2, tmp2, step);
4000   beqz(tmp1, Loop);
4001 }
4002 
4003 // This instruction reads adjacent 4 bytes from the lower half of source register,
4004 // inflate into a register, for example:
4005 // Rs: A7A6A5A4A3A2A1A0
4006 // Rd: 00A300A200A100A0
4007 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
4008   assert_different_registers(Rd, Rs, tmp1, tmp2);
4009 
4010   mv(tmp1, 0xFF000000); // first byte mask at lower word
4011   andr(Rd, Rs, tmp1);
4012   for (int i = 0; i < 2; i++) {
4013     slli(Rd, Rd, wordSize);
4014     srli(tmp1, tmp1, wordSize);
4015     andr(tmp2, Rs, tmp1);
4016     orr(Rd, Rd, tmp2);
4017   }
4018   slli(Rd, Rd, wordSize);
4019   andi(tmp2, Rs, 0xFF); // last byte mask at lower word
4020   orr(Rd, Rd, tmp2);
4021 }
4022 
4023 // This instruction reads adjacent 4 bytes from the upper half of source register,
4024 // inflate into a register, for example:
4025 // Rs: A7A6A5A4A3A2A1A0
4026 // Rd: 00A700A600A500A4
4027 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
4028   assert_different_registers(Rd, Rs, tmp1, tmp2);
4029   srli(Rs, Rs, 32);   // only upper 32 bits are needed
4030   inflate_lo32(Rd, Rs, tmp1, tmp2);
4031 }
4032 
4033 // The size of the blocks erased by the zero_blocks stub.  We must
4034 // handle anything smaller than this ourselves in zero_words().
4035 const int MacroAssembler::zero_words_block_size = 8;
4036 
4037 // zero_words() is used by C2 ClearArray patterns.  It is as small as
4038 // possible, handling small word counts locally and delegating
4039 // anything larger to the zero_blocks stub.  It is expanded many times
4040 // in compiled code, so it is important to keep it short.
4041 
4042 // ptr:   Address of a buffer to be zeroed.
4043 // cnt:   Count in HeapWords.
4044 //
4045 // ptr, cnt, and t0 are clobbered.
4046 address MacroAssembler::zero_words(Register ptr, Register cnt) {
4047   assert(is_power_of_2(zero_words_block_size), "adjust this");
4048   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
4049   assert_different_registers(cnt, t0);
4050 
4051   BLOCK_COMMENT("zero_words {");
4052 
4053   mv(t0, zero_words_block_size);
4054   Label around, done, done16;
4055   bltu(cnt, t0, around);
4056   {
4057     RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::riscv::zero_blocks());
4058     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
4059     if (StubRoutines::riscv::complete()) {
4060       address tpc = trampoline_call(zero_blocks);
4061       if (tpc == nullptr) {
4062         DEBUG_ONLY(reset_labels(around));
4063         postcond(pc() == badAddress);
4064         return nullptr;
4065       }
4066     } else {
4067       jal(zero_blocks);
4068     }
4069   }
4070   bind(around);
4071   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
4072     Label l;
4073     test_bit(t0, cnt, exact_log2(i));
4074     beqz(t0, l);
4075     for (int j = 0; j < i; j++) {
4076       sd(zr, Address(ptr, j * wordSize));
4077     }
4078     addi(ptr, ptr, i * wordSize);
4079     bind(l);
4080   }
4081   {
4082     Label l;
4083     test_bit(t0, cnt, 0);
4084     beqz(t0, l);
4085     sd(zr, Address(ptr, 0));
4086     bind(l);
4087   }
4088 
4089   BLOCK_COMMENT("} zero_words");
4090   postcond(pc() != badAddress);
4091   return pc();
4092 }
4093 
4094 #define SmallArraySize (18 * BytesPerLong)
4095 
4096 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
4097 // cnt:   Immediate count in HeapWords.
4098 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
4099   assert_different_registers(base, t0, t1);
4100 
4101   BLOCK_COMMENT("zero_words {");
4102 
4103   if (cnt <= SmallArraySize / BytesPerLong) {
4104     for (int i = 0; i < (int)cnt; i++) {
4105       sd(zr, Address(base, i * wordSize));
4106     }
4107   } else {
4108     const int unroll = 8; // Number of sd(zr, adr), instructions we'll unroll
4109     int remainder = cnt % unroll;
4110     for (int i = 0; i < remainder; i++) {
4111       sd(zr, Address(base, i * wordSize));
4112     }
4113 
4114     Label loop;
4115     Register cnt_reg = t0;
4116     Register loop_base = t1;
4117     cnt = cnt - remainder;
4118     mv(cnt_reg, cnt);
4119     add(loop_base, base, remainder * wordSize);
4120     bind(loop);
4121     sub(cnt_reg, cnt_reg, unroll);
4122     for (int i = 0; i < unroll; i++) {
4123       sd(zr, Address(loop_base, i * wordSize));
4124     }
4125     add(loop_base, loop_base, unroll * wordSize);
4126     bnez(cnt_reg, loop);
4127   }
4128 
4129   BLOCK_COMMENT("} zero_words");
4130 }
4131 
4132 // base:   Address of a buffer to be filled, 8 bytes aligned.
4133 // cnt:    Count in 8-byte unit.
4134 // value:  Value to be filled with.
4135 // base will point to the end of the buffer after filling.
4136 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
4137 //  Algorithm:
4138 //
4139 //    t0 = cnt & 7
4140 //    cnt -= t0
4141 //    p += t0
4142 //    switch (t0):
4143 //      switch start:
4144 //      do while cnt
4145 //        cnt -= 8
4146 //          p[-8] = value
4147 //        case 7:
4148 //          p[-7] = value
4149 //        case 6:
4150 //          p[-6] = value
4151 //          // ...
4152 //        case 1:
4153 //          p[-1] = value
4154 //        case 0:
4155 //          p += 8
4156 //      do-while end
4157 //    switch end
4158 
4159   assert_different_registers(base, cnt, value, t0, t1);
4160 
4161   Label fini, skip, entry, loop;
4162   const int unroll = 8; // Number of sd instructions we'll unroll
4163 
4164   beqz(cnt, fini);
4165 
4166   andi(t0, cnt, unroll - 1);
4167   sub(cnt, cnt, t0);
4168   // align 8, so first sd n % 8 = mod, next loop sd 8 * n.
4169   shadd(base, t0, base, t1, 3);
4170   la(t1, entry);
4171   slli(t0, t0, 2); // sd_inst_nums * 4; t0 is cnt % 8, so t1 = t1 - sd_inst_nums * 4, 4 is sizeof(inst)
4172   sub(t1, t1, t0);
4173   jr(t1);
4174 
4175   bind(loop);
4176   add(base, base, unroll * 8);
4177   for (int i = -unroll; i < 0; i++) {
4178     sd(value, Address(base, i * 8));
4179   }
4180   bind(entry);
4181   sub(cnt, cnt, unroll);
4182   bgez(cnt, loop);
4183 
4184   bind(fini);
4185 }
4186 
4187 // Zero blocks of memory by using CBO.ZERO.
4188 //
4189 // Aligns the base address first sufficiently for CBO.ZERO, then uses
4190 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
4191 // zeroed in HeapWords.  Returns the count of words left to be zeroed
4192 // in cnt.
4193 //
4194 // NOTE: This is intended to be used in the zero_blocks() stub.  If
4195 // you want to use it elsewhere, note that cnt must be >= CacheLineSize.
4196 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
4197   Label initial_table_end, loop;
4198 
4199   // Align base with cache line size.
4200   neg(tmp1, base);
4201   andi(tmp1, tmp1, CacheLineSize - 1);
4202 
4203   // tmp1: the number of bytes to be filled to align the base with cache line size.
4204   add(base, base, tmp1);
4205   srai(tmp2, tmp1, 3);
4206   sub(cnt, cnt, tmp2);
4207   srli(tmp2, tmp1, 1);
4208   la(tmp1, initial_table_end);
4209   sub(tmp2, tmp1, tmp2);
4210   jr(tmp2);
4211   for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
4212     sd(zr, Address(base, i));
4213   }
4214   bind(initial_table_end);
4215 
4216   mv(tmp1, CacheLineSize / wordSize);
4217   bind(loop);
4218   cbo_zero(base);
4219   sub(cnt, cnt, tmp1);
4220   add(base, base, CacheLineSize);
4221   bge(cnt, tmp1, loop);
4222 }
4223 
4224 // java.lang.Math.round(float a)
4225 // Returns the closest int to the argument, with ties rounding to positive infinity.
4226 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
4227   // this instructions calling sequence provides performance improvement on all tested devices;
4228   // don't change it without re-verification
4229   Label done;
4230   mv(t0, jint_cast(0.5f));
4231   fmv_w_x(ftmp, t0);
4232 
4233   // dst = 0 if NaN
4234   feq_s(t0, src, src); // replacing fclass with feq as performance optimization
4235   mv(dst, zr);
4236   beqz(t0, done);
4237 
4238   // dst = (src + 0.5f) rounded down towards negative infinity
4239   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
4240   //   RDN is required for fadd_s, RNE gives incorrect results:
4241   //     --------------------------------------------------------------------
4242   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
4243   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
4244   //     --------------------------------------------------------------------
4245   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
4246   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
4247   //     --------------------------------------------------------------------
4248   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
4249   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
4250 
4251   bind(done);
4252 }
4253 
4254 // java.lang.Math.round(double a)
4255 // Returns the closest long to the argument, with ties rounding to positive infinity.
4256 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
4257   // this instructions calling sequence provides performance improvement on all tested devices;
4258   // don't change it without re-verification
4259   Label done;
4260   mv(t0, julong_cast(0.5));
4261   fmv_d_x(ftmp, t0);
4262 
4263   // dst = 0 if NaN
4264   feq_d(t0, src, src); // replacing fclass with feq as performance optimization
4265   mv(dst, zr);
4266   beqz(t0, done);
4267 
4268   // dst = (src + 0.5) rounded down towards negative infinity
4269   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
4270   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
4271 
4272   bind(done);
4273 }
4274 
4275 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
4276 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
4277   Label done;                                                                             \
4278   assert_different_registers(dst, tmp);                                                   \
4279   fclass_##FLOATSIG(tmp, src);                                                            \
4280   mv(dst, zr);                                                                            \
4281   /* check if src is NaN */                                                               \
4282   andi(tmp, tmp, 0b1100000000);                                                           \
4283   bnez(tmp, done);                                                                        \
4284   FLOATCVT(dst, src);                                                                     \
4285   bind(done);                                                                             \
4286 }
4287 
4288 FCVT_SAFE(fcvt_w_s, s);
4289 FCVT_SAFE(fcvt_l_s, s);
4290 FCVT_SAFE(fcvt_w_d, d);
4291 FCVT_SAFE(fcvt_l_d, d);
4292 
4293 #undef FCVT_SAFE
4294 
4295 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
4296 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
4297                                          FloatRegister Rs2, int unordered_result) {     \
4298   Label Ldone;                                                                          \
4299   if (unordered_result < 0) {                                                           \
4300     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
4301     /* installs 1 if gt else 0 */                                                       \
4302     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
4303     /* Rs1 > Rs2, install 1 */                                                          \
4304     bgtz(result, Ldone);                                                                \
4305     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4306     addi(result, result, -1);                                                           \
4307     /* Rs1 = Rs2, install 0 */                                                          \
4308     /* NaN or Rs1 < Rs2, install -1 */                                                  \
4309     bind(Ldone);                                                                        \
4310   } else {                                                                              \
4311     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
4312     /* installs 1 if gt or unordered else 0 */                                          \
4313     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
4314     /* Rs1 < Rs2, install -1 */                                                         \
4315     bgtz(result, Ldone);                                                                \
4316     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4317     addi(result, result, -1);                                                           \
4318     /* Rs1 = Rs2, install 0 */                                                          \
4319     /* NaN or Rs1 > Rs2, install 1 */                                                   \
4320     bind(Ldone);                                                                        \
4321     neg(result, result);                                                                \
4322   }                                                                                     \
4323 }
4324 
4325 FCMP(float, s);
4326 FCMP(double, d);
4327 
4328 #undef FCMP
4329 
4330 // Zero words; len is in bytes
4331 // Destroys all registers except addr
4332 // len must be a nonzero multiple of wordSize
4333 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
4334   assert_different_registers(addr, len, tmp, t0, t1);
4335 
4336 #ifdef ASSERT
4337   {
4338     Label L;
4339     andi(t0, len, BytesPerWord - 1);
4340     beqz(t0, L);
4341     stop("len is not a multiple of BytesPerWord");
4342     bind(L);
4343   }
4344 #endif // ASSERT
4345 
4346 #ifndef PRODUCT
4347   block_comment("zero memory");
4348 #endif // PRODUCT
4349 
4350   Label loop;
4351   Label entry;
4352 
4353   // Algorithm:
4354   //
4355   //  t0 = cnt & 7
4356   //  cnt -= t0
4357   //  p += t0
4358   //  switch (t0) {
4359   //    do {
4360   //      cnt -= 8
4361   //        p[-8] = 0
4362   //      case 7:
4363   //        p[-7] = 0
4364   //      case 6:
4365   //        p[-6] = 0
4366   //        ...
4367   //      case 1:
4368   //        p[-1] = 0
4369   //      case 0:
4370   //        p += 8
4371   //     } while (cnt)
4372   //  }
4373 
4374   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
4375 
4376   srli(len, len, LogBytesPerWord);
4377   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
4378   sub(len, len, t0);          // cnt -= unroll
4379   // tmp always points to the end of the region we're about to zero
4380   shadd(tmp, t0, addr, t1, LogBytesPerWord);
4381   la(t1, entry);
4382   slli(t0, t0, 2);
4383   sub(t1, t1, t0);
4384   jr(t1);
4385   bind(loop);
4386   sub(len, len, unroll);
4387   for (int i = -unroll; i < 0; i++) {
4388     sd(zr, Address(tmp, i * wordSize));
4389   }
4390   bind(entry);
4391   add(tmp, tmp, unroll * wordSize);
4392   bnez(len, loop);
4393 }
4394 
4395 // shift left by shamt and add
4396 // Rd = (Rs1 << shamt) + Rs2
4397 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
4398   if (UseZba) {
4399     if (shamt == 1) {
4400       sh1add(Rd, Rs1, Rs2);
4401       return;
4402     } else if (shamt == 2) {
4403       sh2add(Rd, Rs1, Rs2);
4404       return;
4405     } else if (shamt == 3) {
4406       sh3add(Rd, Rs1, Rs2);
4407       return;
4408     }
4409   }
4410 
4411   if (shamt != 0) {
4412     slli(tmp, Rs1, shamt);
4413     add(Rd, Rs2, tmp);
4414   } else {
4415     add(Rd, Rs1, Rs2);
4416   }
4417 }
4418 
4419 void MacroAssembler::zero_extend(Register dst, Register src, int bits) {
4420   if (UseZba && bits == 32) {
4421     zext_w(dst, src);
4422     return;
4423   }
4424 
4425   if (UseZbb && bits == 16) {
4426     zext_h(dst, src);
4427     return;
4428   }
4429 
4430   if (bits == 8) {
4431     zext_b(dst, src);
4432   } else {
4433     slli(dst, src, XLEN - bits);
4434     srli(dst, dst, XLEN - bits);
4435   }
4436 }
4437 
4438 void MacroAssembler::sign_extend(Register dst, Register src, int bits) {
4439   if (UseZbb) {
4440     if (bits == 8) {
4441       sext_b(dst, src);
4442       return;
4443     } else if (bits == 16) {
4444       sext_h(dst, src);
4445       return;
4446     }
4447   }
4448 
4449   if (bits == 32) {
4450     sext_w(dst, src);
4451   } else {
4452     slli(dst, src, XLEN - bits);
4453     srai(dst, dst, XLEN - bits);
4454   }
4455 }
4456 
4457 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
4458 {
4459   if (src1 == src2) {
4460     mv(dst, zr);
4461     return;
4462   }
4463   Label done;
4464   Register left = src1;
4465   Register right = src2;
4466   if (dst == src1) {
4467     assert_different_registers(dst, src2, tmp);
4468     mv(tmp, src1);
4469     left = tmp;
4470   } else if (dst == src2) {
4471     assert_different_registers(dst, src1, tmp);
4472     mv(tmp, src2);
4473     right = tmp;
4474   }
4475 
4476   // installs 1 if gt else 0
4477   slt(dst, right, left);
4478   bnez(dst, done);
4479   slt(dst, left, right);
4480   // dst = -1 if lt; else if eq , dst = 0
4481   neg(dst, dst);
4482   bind(done);
4483 }
4484 
4485 // The java_calling_convention describes stack locations as ideal slots on
4486 // a frame with no abi restrictions. Since we must observe abi restrictions
4487 // (like the placement of the register window) the slots must be biased by
4488 // the following value.
4489 static int reg2offset_in(VMReg r) {
4490   // Account for saved fp and ra
4491   // This should really be in_preserve_stack_slots
4492   return r->reg2stack() * VMRegImpl::stack_slot_size;
4493 }
4494 
4495 static int reg2offset_out(VMReg r) {
4496   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
4497 }
4498 
4499 // On 64 bit we will store integer like items to the stack as
4500 // 64 bits items (riscv64 abi) even though java would only store
4501 // 32bits for a parameter. On 32bit it will simply be 32 bits
4502 // So this routine will do 32->32 on 32bit and 32->64 on 64bit
4503 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
4504   if (src.first()->is_stack()) {
4505     if (dst.first()->is_stack()) {
4506       // stack to stack
4507       ld(tmp, Address(fp, reg2offset_in(src.first())));
4508       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4509     } else {
4510       // stack to reg
4511       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4512     }
4513   } else if (dst.first()->is_stack()) {
4514     // reg to stack
4515     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
4516   } else {
4517     if (dst.first() != src.first()) {
4518       sign_extend(dst.first()->as_Register(), src.first()->as_Register(), 32);
4519     }
4520   }
4521 }
4522 
4523 // An oop arg. Must pass a handle not the oop itself
4524 void MacroAssembler::object_move(OopMap* map,
4525                                  int oop_handle_offset,
4526                                  int framesize_in_slots,
4527                                  VMRegPair src,
4528                                  VMRegPair dst,
4529                                  bool is_receiver,
4530                                  int* receiver_offset) {
4531   assert_cond(map != nullptr && receiver_offset != nullptr);
4532 
4533   // must pass a handle. First figure out the location we use as a handle
4534   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
4535 
4536   // See if oop is null if it is we need no handle
4537 
4538   if (src.first()->is_stack()) {
4539     // Oop is already on the stack as an argument
4540     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
4541     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
4542     if (is_receiver) {
4543       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
4544     }
4545 
4546     ld(t0, Address(fp, reg2offset_in(src.first())));
4547     la(rHandle, Address(fp, reg2offset_in(src.first())));
4548     // conditionally move a null
4549     Label notZero1;
4550     bnez(t0, notZero1);
4551     mv(rHandle, zr);
4552     bind(notZero1);
4553   } else {
4554 
4555     // Oop is in a register we must store it to the space we reserve
4556     // on the stack for oop_handles and pass a handle if oop is non-null
4557 
4558     const Register rOop = src.first()->as_Register();
4559     int oop_slot = -1;
4560     if (rOop == j_rarg0) {
4561       oop_slot = 0;
4562     } else if (rOop == j_rarg1) {
4563       oop_slot = 1;
4564     } else if (rOop == j_rarg2) {
4565       oop_slot = 2;
4566     } else if (rOop == j_rarg3) {
4567       oop_slot = 3;
4568     } else if (rOop == j_rarg4) {
4569       oop_slot = 4;
4570     } else if (rOop == j_rarg5) {
4571       oop_slot = 5;
4572     } else if (rOop == j_rarg6) {
4573       oop_slot = 6;
4574     } else {
4575       assert(rOop == j_rarg7, "wrong register");
4576       oop_slot = 7;
4577     }
4578 
4579     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
4580     int offset = oop_slot * VMRegImpl::stack_slot_size;
4581 
4582     map->set_oop(VMRegImpl::stack2reg(oop_slot));
4583     // Store oop in handle area, may be null
4584     sd(rOop, Address(sp, offset));
4585     if (is_receiver) {
4586       *receiver_offset = offset;
4587     }
4588 
4589     //rOop maybe the same as rHandle
4590     if (rOop == rHandle) {
4591       Label isZero;
4592       beqz(rOop, isZero);
4593       la(rHandle, Address(sp, offset));
4594       bind(isZero);
4595     } else {
4596       Label notZero2;
4597       la(rHandle, Address(sp, offset));
4598       bnez(rOop, notZero2);
4599       mv(rHandle, zr);
4600       bind(notZero2);
4601     }
4602   }
4603 
4604   // If arg is on the stack then place it otherwise it is already in correct reg.
4605   if (dst.first()->is_stack()) {
4606     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
4607   }
4608 }
4609 
4610 // A float arg may have to do float reg int reg conversion
4611 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
4612   assert(src.first()->is_stack() && dst.first()->is_stack() ||
4613          src.first()->is_reg() && dst.first()->is_reg() ||
4614          src.first()->is_stack() && dst.first()->is_reg(), "Unexpected error");
4615   if (src.first()->is_stack()) {
4616     if (dst.first()->is_stack()) {
4617       lwu(tmp, Address(fp, reg2offset_in(src.first())));
4618       sw(tmp, Address(sp, reg2offset_out(dst.first())));
4619     } else if (dst.first()->is_Register()) {
4620       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4621     } else {
4622       ShouldNotReachHere();
4623     }
4624   } else if (src.first() != dst.first()) {
4625     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
4626       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
4627     } else {
4628       ShouldNotReachHere();
4629     }
4630   }
4631 }
4632 
4633 // A long move
4634 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
4635   if (src.first()->is_stack()) {
4636     if (dst.first()->is_stack()) {
4637       // stack to stack
4638       ld(tmp, Address(fp, reg2offset_in(src.first())));
4639       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4640     } else {
4641       // stack to reg
4642       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4643     }
4644   } else if (dst.first()->is_stack()) {
4645     // reg to stack
4646     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
4647   } else {
4648     if (dst.first() != src.first()) {
4649       mv(dst.first()->as_Register(), src.first()->as_Register());
4650     }
4651   }
4652 }
4653 
4654 // A double move
4655 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
4656   assert(src.first()->is_stack() && dst.first()->is_stack() ||
4657          src.first()->is_reg() && dst.first()->is_reg() ||
4658          src.first()->is_stack() && dst.first()->is_reg(), "Unexpected error");
4659   if (src.first()->is_stack()) {
4660     if (dst.first()->is_stack()) {
4661       ld(tmp, Address(fp, reg2offset_in(src.first())));
4662       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4663     } else if (dst.first()-> is_Register()) {
4664       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4665     } else {
4666       ShouldNotReachHere();
4667     }
4668   } else if (src.first() != dst.first()) {
4669     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
4670       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
4671     } else {
4672       ShouldNotReachHere();
4673     }
4674   }
4675 }
4676 
4677 void MacroAssembler::rt_call(address dest, Register tmp) {
4678   CodeBlob *cb = CodeCache::find_blob(dest);
4679   RuntimeAddress target(dest);
4680   if (cb) {
4681     far_call(target);
4682   } else {
4683     relocate(target.rspec(), [&] {
4684       int32_t offset;
4685       la_patchable(tmp, target, offset);
4686       jalr(x1, tmp, offset);
4687     });
4688   }
4689 }
4690 
4691 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
4692   assert(bit_pos < 64, "invalid bit range");
4693   if (UseZbs) {
4694     bexti(Rd, Rs, bit_pos);
4695     return;
4696   }
4697   int64_t imm = (int64_t)(1UL << bit_pos);
4698   if (is_simm12(imm)) {
4699     and_imm12(Rd, Rs, imm);
4700   } else {
4701     srli(Rd, Rs, bit_pos);
4702     and_imm12(Rd, Rd, 1);
4703   }
4704 }
4705 
4706 // Implements lightweight-locking.
4707 //
4708 //  - obj: the object to be locked
4709 //  - tmp1, tmp2, tmp3: temporary registers, will be destroyed
4710 //  - slow: branched to if locking fails
4711 void MacroAssembler::lightweight_lock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
4712   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4713   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
4714 
4715   Label push;
4716   const Register top = tmp1;
4717   const Register mark = tmp2;
4718   const Register t = tmp3;
4719 
4720   // Preload the markWord. It is important that this is the first
4721   // instruction emitted as it is part of C1's null check semantics.
4722   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
4723 
4724   // Check if the lock-stack is full.
4725   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
4726   mv(t, (unsigned)LockStack::end_offset());
4727   bge(top, t, slow, /* is_far */ true);
4728 
4729   // Check for recursion.
4730   add(t, xthread, top);
4731   ld(t, Address(t, -oopSize));
4732   beq(obj, t, push);
4733 
4734   // Check header for monitor (0b10).
4735   test_bit(t, mark, exact_log2(markWord::monitor_value));
4736   bnez(t, slow, /* is_far */ true);
4737 
4738   // Try to lock. Transition lock-bits 0b01 => 0b00
4739   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
4740   ori(mark, mark, markWord::unlocked_value);
4741   xori(t, mark, markWord::unlocked_value);
4742   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
4743           /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
4744   bne(mark, t, slow, /* is_far */ true);
4745 
4746   bind(push);
4747   // After successful lock, push object on lock-stack.
4748   add(t, xthread, top);
4749   sd(obj, Address(t));
4750   addw(top, top, oopSize);
4751   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
4752 }
4753 
4754 // Implements ligthweight-unlocking.
4755 //
4756 // - obj: the object to be unlocked
4757 // - tmp1, tmp2, tmp3: temporary registers
4758 // - slow: branched to if unlocking fails
4759 void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
4760   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4761   assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
4762 
4763 #ifdef ASSERT
4764   {
4765     // Check for lock-stack underflow.
4766     Label stack_ok;
4767     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4768     mv(tmp2, (unsigned)LockStack::start_offset());
4769     bge(tmp1, tmp2, stack_ok);
4770     STOP("Lock-stack underflow");
4771     bind(stack_ok);
4772   }
4773 #endif
4774 
4775   Label unlocked, push_and_slow;
4776   const Register top = tmp1;
4777   const Register mark = tmp2;
4778   const Register t = tmp3;
4779 
4780   // Check if obj is top of lock-stack.
4781   lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
4782   subw(top, top, oopSize);
4783   add(t, xthread, top);
4784   ld(t, Address(t));
4785   bne(obj, t, slow, /* is_far */ true);
4786 
4787   // Pop lock-stack.
4788   DEBUG_ONLY(add(t, xthread, top);)
4789   DEBUG_ONLY(sd(zr, Address(t));)
4790   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
4791 
4792   // Check if recursive.
4793   add(t, xthread, top);
4794   ld(t, Address(t, -oopSize));
4795   beq(obj, t, unlocked);
4796 
4797   // Not recursive. Check header for monitor (0b10).
4798   ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
4799   test_bit(t, mark, exact_log2(markWord::monitor_value));
4800   bnez(t, push_and_slow);
4801 
4802 #ifdef ASSERT
4803   // Check header not unlocked (0b01).
4804   Label not_unlocked;
4805   test_bit(t, mark, exact_log2(markWord::unlocked_value));
4806   beqz(t, not_unlocked);
4807   stop("lightweight_unlock already unlocked");
4808   bind(not_unlocked);
4809 #endif
4810 
4811   // Try to unlock. Transition lock bits 0b00 => 0b01
4812   assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
4813   ori(t, mark, markWord::unlocked_value);
4814   cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
4815           /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
4816   beq(mark, t, unlocked);
4817 
4818   bind(push_and_slow);
4819   // Restore lock-stack and handle the unlock in runtime.
4820   DEBUG_ONLY(add(t, xthread, top);)
4821   DEBUG_ONLY(sd(obj, Address(t));)
4822   addw(top, top, oopSize);
4823   sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
4824   j(slow);
4825 
4826   bind(unlocked);
4827 }