Old src/hotspot/cpu/riscv/macroAssembler

   1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/assembler.hpp"
  29 #include "asm/assembler.inline.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/cardTableBarrierSet.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "interpreter/bytecodeHistogram.hpp"
  37 #include "interpreter/interpreter.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "nativeInst_riscv.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedOops.inline.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "oops/oop.hpp"
  45 #include "runtime/interfaceSupport.inline.hpp"
  46 #include "runtime/javaThread.hpp"
  47 #include "runtime/jniHandles.inline.hpp"
  48 #include "runtime/sharedRuntime.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "utilities/powerOfTwo.hpp"
  51 #ifdef COMPILER2
  52 #include "opto/compile.hpp"
  53 #include "opto/node.hpp"
  54 #include "opto/output.hpp"
  55 #endif
  56 
  57 #ifdef PRODUCT
  58 #define BLOCK_COMMENT(str) /* nothing */
  59 #else
  60 #define BLOCK_COMMENT(str) block_comment(str)
  61 #endif
  62 #define STOP(str) stop(str);
  63 #define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
  64 
  65 static void pass_arg0(MacroAssembler* masm, Register arg) {
  66   if (c_rarg0 != arg) {
  67     masm->mv(c_rarg0, arg);
  68   }
  69 }
  70 
  71 static void pass_arg1(MacroAssembler* masm, Register arg) {
  72   if (c_rarg1 != arg) {
  73     masm->mv(c_rarg1, arg);
  74   }
  75 }
  76 
  77 static void pass_arg2(MacroAssembler* masm, Register arg) {
  78   if (c_rarg2 != arg) {
  79     masm->mv(c_rarg2, arg);
  80   }
  81 }
  82 
  83 static void pass_arg3(MacroAssembler* masm, Register arg) {
  84   if (c_rarg3 != arg) {
  85     masm->mv(c_rarg3, arg);
  86   }
  87 }
  88 
  89 void MacroAssembler::push_cont_fastpath(Register java_thread) {
  90   if (!Continuations::enabled()) return;
  91   Label done;
  92   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  93   bleu(sp, t0, done);
  94   sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
  95   bind(done);
  96 }
  97 
  98 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
  99   if (!Continuations::enabled()) return;
 100   Label done;
 101   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 102   bltu(sp, t0, done);
 103   sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 104   bind(done);
 105 }
 106 
 107 int MacroAssembler::align(int modulus, int extra_offset) {
 108   CompressibleRegion cr(this);
 109   intptr_t before = offset();
 110   while ((offset() + extra_offset) % modulus != 0) { nop(); }
 111   return (int)(offset() - before);
 112 }
 113 
 114 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 115   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 116 }
 117 
 118 // Implementation of call_VM versions
 119 
 120 void MacroAssembler::call_VM(Register oop_result,
 121                              address entry_point,
 122                              bool check_exceptions) {
 123   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 124 }
 125 
 126 void MacroAssembler::call_VM(Register oop_result,
 127                              address entry_point,
 128                              Register arg_1,
 129                              bool check_exceptions) {
 130   pass_arg1(this, arg_1);
 131   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 132 }
 133 
 134 void MacroAssembler::call_VM(Register oop_result,
 135                              address entry_point,
 136                              Register arg_1,
 137                              Register arg_2,
 138                              bool check_exceptions) {
 139   assert(arg_1 != c_rarg2, "smashed arg");
 140   pass_arg2(this, arg_2);
 141   pass_arg1(this, arg_1);
 142   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 143 }
 144 
 145 void MacroAssembler::call_VM(Register oop_result,
 146                              address entry_point,
 147                              Register arg_1,
 148                              Register arg_2,
 149                              Register arg_3,
 150                              bool check_exceptions) {
 151   assert(arg_1 != c_rarg3, "smashed arg");
 152   assert(arg_2 != c_rarg3, "smashed arg");
 153   pass_arg3(this, arg_3);
 154 
 155   assert(arg_1 != c_rarg2, "smashed arg");
 156   pass_arg2(this, arg_2);
 157 
 158   pass_arg1(this, arg_1);
 159   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 160 }
 161 
 162 void MacroAssembler::call_VM(Register oop_result,
 163                              Register last_java_sp,
 164                              address entry_point,
 165                              int number_of_arguments,
 166                              bool check_exceptions) {
 167   call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 168 }
 169 
 170 void MacroAssembler::call_VM(Register oop_result,
 171                              Register last_java_sp,
 172                              address entry_point,
 173                              Register arg_1,
 174                              bool check_exceptions) {
 175   pass_arg1(this, arg_1);
 176   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 177 }
 178 
 179 void MacroAssembler::call_VM(Register oop_result,
 180                              Register last_java_sp,
 181                              address entry_point,
 182                              Register arg_1,
 183                              Register arg_2,
 184                              bool check_exceptions) {
 185 
 186   assert(arg_1 != c_rarg2, "smashed arg");
 187   pass_arg2(this, arg_2);
 188   pass_arg1(this, arg_1);
 189   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 190 }
 191 
 192 void MacroAssembler::call_VM(Register oop_result,
 193                              Register last_java_sp,
 194                              address entry_point,
 195                              Register arg_1,
 196                              Register arg_2,
 197                              Register arg_3,
 198                              bool check_exceptions) {
 199   assert(arg_1 != c_rarg3, "smashed arg");
 200   assert(arg_2 != c_rarg3, "smashed arg");
 201   pass_arg3(this, arg_3);
 202   assert(arg_1 != c_rarg2, "smashed arg");
 203   pass_arg2(this, arg_2);
 204   pass_arg1(this, arg_1);
 205   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 206 }
 207 
 208 void MacroAssembler::post_call_nop() {
 209   if (!Continuations::enabled()) {
 210     return;
 211   }
 212   relocate(post_call_nop_Relocation::spec(), [&] {
 213     InlineSkippedInstructionsCounter skipCounter(this);
 214     nop();
 215     li32(zr, 0);
 216   });
 217 }
 218 
 219 // these are no-ops overridden by InterpreterMacroAssembler
 220 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
 221 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 222 
 223 // Calls to C land
 224 //
 225 // When entering C land, the fp, & esp of the last Java frame have to be recorded
 226 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 227 // has to be reset to 0. This is required to allow proper stack traversal.
 228 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 229                                          Register last_java_fp,
 230                                          Register last_java_pc,
 231                                          Register tmp) {
 232 
 233   if (last_java_pc->is_valid()) {
 234       sd(last_java_pc, Address(xthread,
 235                                JavaThread::frame_anchor_offset() +
 236                                JavaFrameAnchor::last_Java_pc_offset()));
 237   }
 238 
 239   // determine last_java_sp register
 240   if (last_java_sp == sp) {
 241     mv(tmp, sp);
 242     last_java_sp = tmp;
 243   } else if (!last_java_sp->is_valid()) {
 244     last_java_sp = esp;
 245   }
 246 
 247   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 248 
 249   // last_java_fp is optional
 250   if (last_java_fp->is_valid()) {
 251     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 252   }
 253 }
 254 
 255 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 256                                          Register last_java_fp,
 257                                          address  last_java_pc,
 258                                          Register tmp) {
 259   assert(last_java_pc != nullptr, "must provide a valid PC");
 260 
 261   la(tmp, last_java_pc);
 262   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 263 
 264   set_last_Java_frame(last_java_sp, last_java_fp, noreg, tmp);
 265 }
 266 
 267 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 268                                          Register last_java_fp,
 269                                          Label &L,
 270                                          Register tmp) {
 271   if (L.is_bound()) {
 272     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 273   } else {
 274     L.add_patch_at(code(), locator());
 275     IncompressibleRegion ir(this);  // the label address will be patched back.
 276     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 277   }
 278 }
 279 
 280 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 281   // we must set sp to zero to clear frame
 282   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 283 
 284   // must clear fp, so that compiled frames are not confused; it is
 285   // possible that we need it only for debugging
 286   if (clear_fp) {
 287     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 288   }
 289 
 290   // Always clear the pc because it could have been set by make_walkable()
 291   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 292 }
 293 
 294 void MacroAssembler::call_VM_base(Register oop_result,
 295                                   Register java_thread,
 296                                   Register last_java_sp,
 297                                   address  entry_point,
 298                                   int      number_of_arguments,
 299                                   bool     check_exceptions) {
 300    // determine java_thread register
 301   if (!java_thread->is_valid()) {
 302     java_thread = xthread;
 303   }
 304   // determine last_java_sp register
 305   if (!last_java_sp->is_valid()) {
 306     last_java_sp = esp;
 307   }
 308 
 309   // debugging support
 310   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 311   assert(java_thread == xthread, "unexpected register");
 312 
 313   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 314   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 315 
 316   // push java thread (becomes first argument of C function)
 317   mv(c_rarg0, java_thread);
 318 
 319   // set last Java frame before call
 320   assert(last_java_sp != fp, "can't use fp");
 321 
 322   Label l;
 323   set_last_Java_frame(last_java_sp, fp, l, t0);
 324 
 325   // do the call, remove parameters
 326   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 327 
 328   // reset last Java frame
 329   // Only interpreter should have to clear fp
 330   reset_last_Java_frame(true);
 331 
 332    // C++ interp handles this in the interpreter
 333   check_and_handle_popframe(java_thread);
 334   check_and_handle_earlyret(java_thread);
 335 
 336   if (check_exceptions) {
 337     // check for pending exceptions (java_thread is set upon return)
 338     ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 339     Label ok;
 340     beqz(t0, ok);
 341     RuntimeAddress target(StubRoutines::forward_exception_entry());
 342     relocate(target.rspec(), [&] {
 343       int32_t offset;
 344       la_patchable(t0, target, offset);
 345       jalr(x0, t0, offset);
 346     });
 347     bind(ok);
 348   }
 349 
 350   // get oop result if there is one and reset the value in the thread
 351   if (oop_result->is_valid()) {
 352     get_vm_result(oop_result, java_thread);
 353   }
 354 }
 355 
 356 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 357   ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 358   sd(zr, Address(java_thread, JavaThread::vm_result_offset()));
 359   verify_oop_msg(oop_result, "broken oop in call_VM_base");
 360 }
 361 
 362 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 363   ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 364   sd(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 365 }
 366 
 367 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 368   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
 369   assert_different_registers(klass, xthread, tmp);
 370 
 371   Label L_fallthrough, L_tmp;
 372   if (L_fast_path == nullptr) {
 373     L_fast_path = &L_fallthrough;
 374   } else if (L_slow_path == nullptr) {
 375     L_slow_path = &L_fallthrough;
 376   }
 377 
 378   // Fast path check: class is fully initialized
 379   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 380   sub(tmp, tmp, InstanceKlass::fully_initialized);
 381   beqz(tmp, *L_fast_path);
 382 
 383   // Fast path check: current thread is initializer thread
 384   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 385 
 386   if (L_slow_path == &L_fallthrough) {
 387     beq(xthread, tmp, *L_fast_path);
 388     bind(*L_slow_path);
 389   } else if (L_fast_path == &L_fallthrough) {
 390     bne(xthread, tmp, *L_slow_path);
 391     bind(*L_fast_path);
 392   } else {
 393     Unimplemented();
 394   }
 395 }
 396 
 397 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
 398   if (!VerifyOops) { return; }
 399 
 400   // Pass register number to verify_oop_subroutine
 401   const char* b = nullptr;
 402   {
 403     ResourceMark rm;
 404     stringStream ss;
 405     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
 406     b = code_string(ss.as_string());
 407   }
 408   BLOCK_COMMENT("verify_oop {");
 409 
 410   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 411 
 412   mv(c_rarg0, reg); // c_rarg0 : x10
 413   {
 414     // The length of the instruction sequence emitted should not depend
 415     // on the address of the char buffer so that the size of mach nodes for
 416     // scratch emit and normal emit matches.
 417     IncompressibleRegion ir(this);  // Fixed length
 418     movptr(t0, (address) b);
 419   }
 420 
 421   // call indirectly to solve generation ordering problem
 422   ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 423   relocate(target.rspec(), [&] {
 424     int32_t offset;
 425     la_patchable(t1, target, offset);
 426     ld(t1, Address(t1, offset));
 427   });
 428   jalr(t1);
 429 
 430   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 431 
 432   BLOCK_COMMENT("} verify_oop");
 433 }
 434 
 435 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
 436   if (!VerifyOops) {
 437     return;
 438   }
 439 
 440   const char* b = nullptr;
 441   {
 442     ResourceMark rm;
 443     stringStream ss;
 444     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
 445     b = code_string(ss.as_string());
 446   }
 447   BLOCK_COMMENT("verify_oop_addr {");
 448 
 449   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 450 
 451   if (addr.uses(sp)) {
 452     la(x10, addr);
 453     ld(x10, Address(x10, 4 * wordSize));
 454   } else {
 455     ld(x10, addr);
 456   }
 457 
 458   {
 459     // The length of the instruction sequence emitted should not depend
 460     // on the address of the char buffer so that the size of mach nodes for
 461     // scratch emit and normal emit matches.
 462     IncompressibleRegion ir(this);  // Fixed length
 463     movptr(t0, (address) b);
 464   }
 465 
 466   // call indirectly to solve generation ordering problem
 467   ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 468   relocate(target.rspec(), [&] {
 469     int32_t offset;
 470     la_patchable(t1, target, offset);
 471     ld(t1, Address(t1, offset));
 472   });
 473   jalr(t1);
 474 
 475   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 476 
 477   BLOCK_COMMENT("} verify_oop_addr");
 478 }
 479 
 480 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 481                                          int extra_slot_offset) {
 482   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 483   int stackElementSize = Interpreter::stackElementSize;
 484   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 485 #ifdef ASSERT
 486   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 487   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 488 #endif
 489   if (arg_slot.is_constant()) {
 490     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 491   } else {
 492     assert_different_registers(t0, arg_slot.as_register());
 493     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 494     return Address(t0, offset);
 495   }
 496 }
 497 
 498 #ifndef PRODUCT
 499 extern "C" void findpc(intptr_t x);
 500 #endif
 501 
 502 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 503 {
 504   // In order to get locks to work, we need to fake a in_VM state
 505   if (ShowMessageBoxOnError) {
 506     JavaThread* thread = JavaThread::current();
 507     JavaThreadState saved_state = thread->thread_state();
 508     thread->set_thread_state(_thread_in_vm);
 509 #ifndef PRODUCT
 510     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 511       ttyLocker ttyl;
 512       BytecodeCounter::print();
 513     }
 514 #endif
 515     if (os::message_box(msg, "Execution stopped, print registers?")) {
 516       ttyLocker ttyl;
 517       tty->print_cr(" pc = 0x%016lx", pc);
 518 #ifndef PRODUCT
 519       tty->cr();
 520       findpc(pc);
 521       tty->cr();
 522 #endif
 523       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 524       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 525       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 526       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 527       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 528       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 529       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 530       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 531       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 532       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 533       tty->print_cr("x10 = 0x%016lx", regs[10]);
 534       tty->print_cr("x11 = 0x%016lx", regs[11]);
 535       tty->print_cr("x12 = 0x%016lx", regs[12]);
 536       tty->print_cr("x13 = 0x%016lx", regs[13]);
 537       tty->print_cr("x14 = 0x%016lx", regs[14]);
 538       tty->print_cr("x15 = 0x%016lx", regs[15]);
 539       tty->print_cr("x16 = 0x%016lx", regs[16]);
 540       tty->print_cr("x17 = 0x%016lx", regs[17]);
 541       tty->print_cr("x18 = 0x%016lx", regs[18]);
 542       tty->print_cr("x19 = 0x%016lx", regs[19]);
 543       tty->print_cr("x20 = 0x%016lx", regs[20]);
 544       tty->print_cr("x21 = 0x%016lx", regs[21]);
 545       tty->print_cr("x22 = 0x%016lx", regs[22]);
 546       tty->print_cr("x23 = 0x%016lx", regs[23]);
 547       tty->print_cr("x24 = 0x%016lx", regs[24]);
 548       tty->print_cr("x25 = 0x%016lx", regs[25]);
 549       tty->print_cr("x26 = 0x%016lx", regs[26]);
 550       tty->print_cr("x27 = 0x%016lx", regs[27]);
 551       tty->print_cr("x28 = 0x%016lx", regs[28]);
 552       tty->print_cr("x30 = 0x%016lx", regs[30]);
 553       tty->print_cr("x31 = 0x%016lx", regs[31]);
 554       BREAKPOINT;
 555     }
 556   }
 557   fatal("DEBUG MESSAGE: %s", msg);
 558 }
 559 
 560 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
 561   assert_different_registers(value, tmp1, tmp2);
 562   Label done, tagged, weak_tagged;
 563 
 564   beqz(value, done);           // Use null as-is.
 565   // Test for tag.
 566   andi(tmp1, value, JNIHandles::tag_mask);
 567   bnez(tmp1, tagged);
 568 
 569   // Resolve local handle
 570   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
 571   verify_oop(value);
 572   j(done);
 573 
 574   bind(tagged);
 575   // Test for jweak tag.
 576   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
 577   test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
 578   bnez(tmp1, weak_tagged);
 579 
 580   // Resolve global handle
 581   access_load_at(T_OBJECT, IN_NATIVE, value,
 582                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 583   verify_oop(value);
 584   j(done);
 585 
 586   bind(weak_tagged);
 587   // Resolve jweak.
 588   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
 589                  Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
 590   verify_oop(value);
 591 
 592   bind(done);
 593 }
 594 
 595 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
 596   assert_different_registers(value, tmp1, tmp2);
 597   Label done;
 598 
 599   beqz(value, done);           // Use null as-is.
 600 
 601 #ifdef ASSERT
 602   {
 603     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
 604     Label valid_global_tag;
 605     test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
 606     bnez(tmp1, valid_global_tag);
 607     stop("non global jobject using resolve_global_jobject");
 608     bind(valid_global_tag);
 609   }
 610 #endif
 611 
 612   // Resolve global handle
 613   access_load_at(T_OBJECT, IN_NATIVE, value,
 614                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 615   verify_oop(value);
 616 
 617   bind(done);
 618 }
 619 
 620 void MacroAssembler::stop(const char* msg) {
 621   BLOCK_COMMENT(msg);
 622   illegal_instruction(Assembler::csr::time);
 623   emit_int64((uintptr_t)msg);
 624 }
 625 
 626 void MacroAssembler::unimplemented(const char* what) {
 627   const char* buf = nullptr;
 628   {
 629     ResourceMark rm;
 630     stringStream ss;
 631     ss.print("unimplemented: %s", what);
 632     buf = code_string(ss.as_string());
 633   }
 634   stop(buf);
 635 }
 636 
 637 void MacroAssembler::emit_static_call_stub() {
 638   IncompressibleRegion ir(this);  // Fixed length: see CompiledStaticCall::to_interp_stub_size().
 639   // CompiledDirectStaticCall::set_to_interpreted knows the
 640   // exact layout of this stub.
 641 
 642   mov_metadata(xmethod, (Metadata*)nullptr);
 643 
 644   // Jump to the entry point of the c2i stub.
 645   int32_t offset = 0;
 646   movptr(t0, 0, offset);
 647   jalr(x0, t0, offset);
 648 }
 649 
 650 void MacroAssembler::call_VM_leaf_base(address entry_point,
 651                                        int number_of_arguments,
 652                                        Label *retaddr) {
 653   push_reg(RegSet::of(t0, xmethod), sp);   // push << t0 & xmethod >> to sp
 654   call(entry_point);
 655   if (retaddr != nullptr) {
 656     bind(*retaddr);
 657   }
 658   pop_reg(RegSet::of(t0, xmethod), sp);   // pop << t0 & xmethod >> from sp
 659 }
 660 
 661 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
 662   call_VM_leaf_base(entry_point, number_of_arguments);
 663 }
 664 
 665 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
 666   pass_arg0(this, arg_0);
 667   call_VM_leaf_base(entry_point, 1);
 668 }
 669 
 670 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 671   pass_arg0(this, arg_0);
 672   pass_arg1(this, arg_1);
 673   call_VM_leaf_base(entry_point, 2);
 674 }
 675 
 676 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
 677                                   Register arg_1, Register arg_2) {
 678   pass_arg0(this, arg_0);
 679   pass_arg1(this, arg_1);
 680   pass_arg2(this, arg_2);
 681   call_VM_leaf_base(entry_point, 3);
 682 }
 683 
 684 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
 685   pass_arg0(this, arg_0);
 686   MacroAssembler::call_VM_leaf_base(entry_point, 1);
 687 }
 688 
 689 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 690 
 691   assert(arg_0 != c_rarg1, "smashed arg");
 692   pass_arg1(this, arg_1);
 693   pass_arg0(this, arg_0);
 694   MacroAssembler::call_VM_leaf_base(entry_point, 2);
 695 }
 696 
 697 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 698   assert(arg_0 != c_rarg2, "smashed arg");
 699   assert(arg_1 != c_rarg2, "smashed arg");
 700   pass_arg2(this, arg_2);
 701   assert(arg_0 != c_rarg1, "smashed arg");
 702   pass_arg1(this, arg_1);
 703   pass_arg0(this, arg_0);
 704   MacroAssembler::call_VM_leaf_base(entry_point, 3);
 705 }
 706 
 707 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
 708   assert(arg_0 != c_rarg3, "smashed arg");
 709   assert(arg_1 != c_rarg3, "smashed arg");
 710   assert(arg_2 != c_rarg3, "smashed arg");
 711   pass_arg3(this, arg_3);
 712   assert(arg_0 != c_rarg2, "smashed arg");
 713   assert(arg_1 != c_rarg2, "smashed arg");
 714   pass_arg2(this, arg_2);
 715   assert(arg_0 != c_rarg1, "smashed arg");
 716   pass_arg1(this, arg_1);
 717   pass_arg0(this, arg_0);
 718   MacroAssembler::call_VM_leaf_base(entry_point, 4);
 719 }
 720 
 721 void MacroAssembler::la(Register Rd, const address dest) {
 722   int64_t offset = dest - pc();
 723   if (is_valid_32bit_offset(offset)) {
 724     auipc(Rd, (int32_t)offset + 0x800);  //0x800, Note:the 11th sign bit
 725     addi(Rd, Rd, ((int64_t)offset << 52) >> 52);
 726   } else {
 727     movptr(Rd, dest);
 728   }
 729 }
 730 
 731 void MacroAssembler::la(Register Rd, const Address &adr) {
 732   switch (adr.getMode()) {
 733     case Address::literal: {
 734       relocInfo::relocType rtype = adr.rspec().reloc()->type();
 735       if (rtype == relocInfo::none) {
 736         mv(Rd, (intptr_t)(adr.target()));
 737       } else {
 738         relocate(adr.rspec(), [&] {
 739           movptr(Rd, adr.target());
 740         });
 741       }
 742       break;
 743     }
 744     case Address::base_plus_offset: {
 745       Address new_adr = legitimize_address(Rd, adr);
 746       if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
 747         addi(Rd, new_adr.base(), new_adr.offset());
 748       }
 749       break;
 750     }
 751     default:
 752       ShouldNotReachHere();
 753   }
 754 }
 755 
 756 void MacroAssembler::la(Register Rd, Label &label) {
 757   IncompressibleRegion ir(this);   // the label address may be patched back.
 758   wrap_label(Rd, label, &MacroAssembler::la);
 759 }
 760 
 761 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
 762   lui(Rd, (uint32_t)imm << 12);
 763   srli(Rd, Rd, 12);
 764 }
 765 
 766 void MacroAssembler::li32(Register Rd, int32_t imm) {
 767   // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
 768   int64_t upper = imm, lower = imm;
 769   lower = (imm << 20) >> 20;
 770   upper -= lower;
 771   upper = (int32_t)upper;
 772   // lui Rd, imm[31:12] + imm[11]
 773   lui(Rd, upper);
 774   // use addiw to distinguish li32 to li64
 775   addiw(Rd, Rd, lower);
 776 }
 777 
 778 void MacroAssembler::li64(Register Rd, int64_t imm) {
 779   // Load upper 32 bits. upper = imm[63:32], but if imm[31] == 1 or
 780   // (imm[31:20] == 0x7ff && imm[19] == 1), upper = imm[63:32] + 1.
 781   int64_t lower = imm & 0xffffffff;
 782   lower -= ((lower << 44) >> 44);
 783   int64_t tmp_imm = ((uint64_t)(imm & 0xffffffff00000000)) + (uint64_t)lower;
 784   int32_t upper = (tmp_imm - (int32_t)lower) >> 32;
 785 
 786   // Load upper 32 bits
 787   int64_t up = upper, lo = upper;
 788   lo = (lo << 52) >> 52;
 789   up -= lo;
 790   up = (int32_t)up;
 791   lui(Rd, up);
 792   addi(Rd, Rd, lo);
 793 
 794   // Load the rest 32 bits.
 795   slli(Rd, Rd, 12);
 796   addi(Rd, Rd, (int32_t)lower >> 20);
 797   slli(Rd, Rd, 12);
 798   lower = ((int32_t)imm << 12) >> 20;
 799   addi(Rd, Rd, lower);
 800   slli(Rd, Rd, 8);
 801   lower = imm & 0xff;
 802   addi(Rd, Rd, lower);
 803 }
 804 
 805 void MacroAssembler::li(Register Rd, int64_t imm) {
 806   // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
 807   // li -> c.li
 808   if (do_compress() && (is_simm6(imm) && Rd != x0)) {
 809     c_li(Rd, imm);
 810     return;
 811   }
 812 
 813   int shift = 12;
 814   int64_t upper = imm, lower = imm;
 815   // Split imm to a lower 12-bit sign-extended part and the remainder,
 816   // because addi will sign-extend the lower imm.
 817   lower = ((int32_t)imm << 20) >> 20;
 818   upper -= lower;
 819 
 820   // Test whether imm is a 32-bit integer.
 821   if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
 822         (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
 823     while (((upper >> shift) & 1) == 0) { shift++; }
 824     upper >>= shift;
 825     li(Rd, upper);
 826     slli(Rd, Rd, shift);
 827     if (lower != 0) {
 828       addi(Rd, Rd, lower);
 829     }
 830   } else {
 831     // 32-bit integer
 832     Register hi_Rd = zr;
 833     if (upper != 0) {
 834       lui(Rd, (int32_t)upper);
 835       hi_Rd = Rd;
 836     }
 837     if (lower != 0 || hi_Rd == zr) {
 838       addiw(Rd, hi_Rd, lower);
 839     }
 840   }
 841 }
 842 
 843 #define INSN(NAME, REGISTER)                                       \
 844   void MacroAssembler::NAME(const address dest, Register temp) {   \
 845     assert_cond(dest != nullptr);                                  \
 846     int64_t distance = dest - pc();                                \
 847     if (is_simm21(distance) && ((distance % 2) == 0)) {            \
 848       Assembler::jal(REGISTER, distance);                          \
 849     } else {                                                       \
 850       assert(temp != noreg, "expecting a register");               \
 851       int32_t offset = 0;                                          \
 852       movptr(temp, dest, offset);                                  \
 853       Assembler::jalr(REGISTER, temp, offset);                     \
 854     }                                                              \
 855   }                                                                \
 856 
 857   INSN(j,   x0);
 858   INSN(jal, x1);
 859 
 860 #undef INSN
 861 
 862 #define INSN(NAME, REGISTER)                                       \
 863   void MacroAssembler::NAME(const Address &adr, Register temp) {   \
 864     switch (adr.getMode()) {                                       \
 865       case Address::literal: {                                     \
 866         relocate(adr.rspec(), [&] {                                \
 867           NAME(adr.target(), temp);                                \
 868         });                                                        \
 869         break;                                                     \
 870       }                                                            \
 871       case Address::base_plus_offset: {                            \
 872         int32_t offset = ((int32_t)adr.offset() << 20) >> 20;      \
 873         la(temp, Address(adr.base(), adr.offset() - offset));      \
 874         Assembler::jalr(REGISTER, temp, offset);                   \
 875         break;                                                     \
 876       }                                                            \
 877       default:                                                     \
 878         ShouldNotReachHere();                                      \
 879     }                                                              \
 880   }
 881 
 882   INSN(j,   x0);
 883   INSN(jal, x1);
 884 
 885 #undef INSN
 886 
 887 #define INSN(NAME)                                                                    \
 888   void MacroAssembler::NAME(Register Rd, const address dest, Register temp) {         \
 889     assert_cond(dest != nullptr);                                                     \
 890     int64_t distance = dest - pc();                                                   \
 891     if (is_simm21(distance) && ((distance % 2) == 0)) {                               \
 892       Assembler::NAME(Rd, distance);                                                  \
 893     } else {                                                                          \
 894       assert_different_registers(Rd, temp);                                           \
 895       int32_t offset = 0;                                                             \
 896       movptr(temp, dest, offset);                                                     \
 897       jalr(Rd, temp, offset);                                                         \
 898     }                                                                                 \
 899   }                                                                                   \
 900   void MacroAssembler::NAME(Register Rd, Label &L, Register temp) {                   \
 901     assert_different_registers(Rd, temp);                                             \
 902     wrap_label(Rd, L, temp, &MacroAssembler::NAME);                                   \
 903   }
 904 
 905   INSN(jal);
 906 
 907 #undef INSN
 908 
 909 #define INSN(NAME, REGISTER)                                       \
 910   void MacroAssembler::NAME(Label &l, Register temp) {             \
 911     jal(REGISTER, l, temp);                                        \
 912   }                                                                \
 913 
 914   INSN(j,   x0);
 915   INSN(jal, x1);
 916 
 917 #undef INSN
 918 
 919 void MacroAssembler::wrap_label(Register Rt, Label &L, Register tmp, load_insn_by_temp insn) {
 920   if (L.is_bound()) {
 921     (this->*insn)(Rt, target(L), tmp);
 922   } else {
 923     L.add_patch_at(code(), locator());
 924     (this->*insn)(Rt, pc(), tmp);
 925   }
 926 }
 927 
 928 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
 929   if (L.is_bound()) {
 930     (this->*insn)(Rt, target(L));
 931   } else {
 932     L.add_patch_at(code(), locator());
 933     (this->*insn)(Rt, pc());
 934   }
 935 }
 936 
 937 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
 938                                 compare_and_branch_insn insn,
 939                                 compare_and_branch_label_insn neg_insn, bool is_far) {
 940   if (is_far) {
 941     Label done;
 942     (this->*neg_insn)(r1, r2, done, /* is_far */ false);
 943     j(L);
 944     bind(done);
 945   } else {
 946     if (L.is_bound()) {
 947       (this->*insn)(r1, r2, target(L));
 948     } else {
 949       L.add_patch_at(code(), locator());
 950       (this->*insn)(r1, r2, pc());
 951     }
 952   }
 953 }
 954 
 955 #define INSN(NAME, NEG_INSN)                                                              \
 956   void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
 957     wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
 958   }
 959 
 960   INSN(beq,  bne);
 961   INSN(bne,  beq);
 962   INSN(blt,  bge);
 963   INSN(bge,  blt);
 964   INSN(bltu, bgeu);
 965   INSN(bgeu, bltu);
 966 
 967 #undef INSN
 968 
 969 #define INSN(NAME)                                                                \
 970   void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
 971     NAME(Rs, zr, dest);                                                           \
 972   }                                                                               \
 973   void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
 974     NAME(Rs, zr, l, is_far);                                                      \
 975   }                                                                               \
 976 
 977   INSN(beq);
 978   INSN(bne);
 979   INSN(blt);
 980   INSN(ble);
 981   INSN(bge);
 982   INSN(bgt);
 983 
 984 #undef INSN
 985 
 986 #define INSN(NAME, NEG_INSN)                                                      \
 987   void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
 988     NEG_INSN(Rt, Rs, dest);                                                       \
 989   }                                                                               \
 990   void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
 991     NEG_INSN(Rt, Rs, l, is_far);                                                  \
 992   }
 993 
 994   INSN(bgt,  blt);
 995   INSN(ble,  bge);
 996   INSN(bgtu, bltu);
 997   INSN(bleu, bgeu);
 998 
 999 #undef INSN
1000 
1001 // Float compare branch instructions
1002 
1003 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1004   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1005     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1006     BRANCH(t0, l, is_far);                                                                                              \
1007   }                                                                                                                     \
1008   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1009     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1010     BRANCH(t0, l, is_far);                                                                                              \
1011   }
1012 
1013   INSN(beq, feq, bnez);
1014   INSN(bne, feq, beqz);
1015 
1016 #undef INSN
1017 
1018 
1019 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1020   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1021                                     bool is_far, bool is_unordered) {                 \
1022     if (is_unordered) {                                                               \
1023       /* jump if either source is NaN or condition is expected */                     \
1024       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1025       beqz(t0, l, is_far);                                                            \
1026     } else {                                                                          \
1027       /* jump if no NaN in source and condition is expected */                        \
1028       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1029       bnez(t0, l, is_far);                                                            \
1030     }                                                                                 \
1031   }                                                                                   \
1032   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1033                                      bool is_far, bool is_unordered) {                \
1034     if (is_unordered) {                                                               \
1035       /* jump if either source is NaN or condition is expected */                     \
1036       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1037       beqz(t0, l, is_far);                                                            \
1038     } else {                                                                          \
1039       /* jump if no NaN in source and condition is expected */                        \
1040       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1041       bnez(t0, l, is_far);                                                            \
1042     }                                                                                 \
1043   }
1044 
1045   INSN(ble, fle, flt);
1046   INSN(blt, flt, fle);
1047 
1048 #undef INSN
1049 
1050 #define INSN(NAME, CMP)                                                              \
1051   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1052                                     bool is_far, bool is_unordered) {                \
1053     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1054   }                                                                                  \
1055   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1056                                      bool is_far, bool is_unordered) {               \
1057     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1058   }
1059 
1060   INSN(bgt, blt);
1061   INSN(bge, ble);
1062 
1063 #undef INSN
1064 
1065 
1066 #define INSN(NAME, CSR)                       \
1067   void MacroAssembler::NAME(Register Rd) {    \
1068     csrr(Rd, CSR);                            \
1069   }
1070 
1071   INSN(rdinstret,  CSR_INSTRET);
1072   INSN(rdcycle,    CSR_CYCLE);
1073   INSN(rdtime,     CSR_TIME);
1074   INSN(frcsr,      CSR_FCSR);
1075   INSN(frrm,       CSR_FRM);
1076   INSN(frflags,    CSR_FFLAGS);
1077 
1078 #undef INSN
1079 
1080 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1081   csrrs(Rd, csr, x0);
1082 }
1083 
1084 #define INSN(NAME, OPFUN)                                      \
1085   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
1086     OPFUN(x0, csr, Rs);                                        \
1087   }
1088 
1089   INSN(csrw, csrrw);
1090   INSN(csrs, csrrs);
1091   INSN(csrc, csrrc);
1092 
1093 #undef INSN
1094 
1095 #define INSN(NAME, OPFUN)                                      \
1096   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
1097     OPFUN(x0, csr, imm);                                       \
1098   }
1099 
1100   INSN(csrwi, csrrwi);
1101   INSN(csrsi, csrrsi);
1102   INSN(csrci, csrrci);
1103 
1104 #undef INSN
1105 
1106 #define INSN(NAME, CSR)                                      \
1107   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
1108     csrrw(Rd, CSR, Rs);                                      \
1109   }
1110 
1111   INSN(fscsr,   CSR_FCSR);
1112   INSN(fsrm,    CSR_FRM);
1113   INSN(fsflags, CSR_FFLAGS);
1114 
1115 #undef INSN
1116 
1117 #define INSN(NAME)                              \
1118   void MacroAssembler::NAME(Register Rs) {      \
1119     NAME(x0, Rs);                               \
1120   }
1121 
1122   INSN(fscsr);
1123   INSN(fsrm);
1124   INSN(fsflags);
1125 
1126 #undef INSN
1127 
1128 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1129   guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1130   csrrwi(Rd, CSR_FRM, imm);
1131 }
1132 
1133 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1134    csrrwi(Rd, CSR_FFLAGS, imm);
1135 }
1136 
1137 #define INSN(NAME)                             \
1138   void MacroAssembler::NAME(unsigned imm) {    \
1139     NAME(x0, imm);                             \
1140   }
1141 
1142   INSN(fsrmi);
1143   INSN(fsflagsi);
1144 
1145 #undef INSN
1146 
1147 void MacroAssembler::push_reg(Register Rs)
1148 {
1149   addi(esp, esp, 0 - wordSize);
1150   sd(Rs, Address(esp, 0));
1151 }
1152 
1153 void MacroAssembler::pop_reg(Register Rd)
1154 {
1155   ld(Rd, Address(esp, 0));
1156   addi(esp, esp, wordSize);
1157 }
1158 
1159 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1160   int count = 0;
1161   // Scan bitset to accumulate register pairs
1162   for (int reg = 31; reg >= 0; reg--) {
1163     if ((1U << 31) & bitset) {
1164       regs[count++] = reg;
1165     }
1166     bitset <<= 1;
1167   }
1168   return count;
1169 }
1170 
1171 // Push integer registers in the bitset supplied. Don't push sp.
1172 // Return the number of words pushed
1173 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
1174   DEBUG_ONLY(int words_pushed = 0;)
1175   unsigned char regs[32];
1176   int count = bitset_to_regs(bitset, regs);
1177   // reserve one slot to align for odd count
1178   int offset = is_even(count) ? 0 : wordSize;
1179 
1180   if (count) {
1181     addi(stack, stack, -count * wordSize - offset);
1182   }
1183   for (int i = count - 1; i >= 0; i--) {
1184     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1185     DEBUG_ONLY(words_pushed++;)
1186   }
1187 
1188   assert(words_pushed == count, "oops, pushed != count");
1189 
1190   return count;
1191 }
1192 
1193 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1194   DEBUG_ONLY(int words_popped = 0;)
1195   unsigned char regs[32];
1196   int count = bitset_to_regs(bitset, regs);
1197   // reserve one slot to align for odd count
1198   int offset = is_even(count) ? 0 : wordSize;
1199 
1200   for (int i = count - 1; i >= 0; i--) {
1201     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1202     DEBUG_ONLY(words_popped++;)
1203   }
1204 
1205   if (count) {
1206     addi(stack, stack, count * wordSize + offset);
1207   }
1208   assert(words_popped == count, "oops, popped != count");
1209 
1210   return count;
1211 }
1212 
1213 // Push floating-point registers in the bitset supplied.
1214 // Return the number of words pushed
1215 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1216   DEBUG_ONLY(int words_pushed = 0;)
1217   unsigned char regs[32];
1218   int count = bitset_to_regs(bitset, regs);
1219   int push_slots = count + (count & 1);
1220 
1221   if (count) {
1222     addi(stack, stack, -push_slots * wordSize);
1223   }
1224 
1225   for (int i = count - 1; i >= 0; i--) {
1226     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1227     DEBUG_ONLY(words_pushed++;)
1228   }
1229 
1230   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1231 
1232   return count;
1233 }
1234 
1235 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1236   DEBUG_ONLY(int words_popped = 0;)
1237   unsigned char regs[32];
1238   int count = bitset_to_regs(bitset, regs);
1239   int pop_slots = count + (count & 1);
1240 
1241   for (int i = count - 1; i >= 0; i--) {
1242     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1243     DEBUG_ONLY(words_popped++;)
1244   }
1245 
1246   if (count) {
1247     addi(stack, stack, pop_slots * wordSize);
1248   }
1249 
1250   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1251 
1252   return count;
1253 }
1254 
1255 #ifdef COMPILER2
1256 // Push vector registers in the bitset supplied.
1257 // Return the number of words pushed
1258 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
1259   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1260 
1261   // Scan bitset to accumulate register pairs
1262   unsigned char regs[32];
1263   int count = bitset_to_regs(bitset, regs);
1264 
1265   for (int i = 0; i < count; i++) {
1266     sub(stack, stack, vector_size_in_bytes);
1267     vs1r_v(as_VectorRegister(regs[i]), stack);
1268   }
1269 
1270   return count * vector_size_in_bytes / wordSize;
1271 }
1272 
1273 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
1274   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1275 
1276   // Scan bitset to accumulate register pairs
1277   unsigned char regs[32];
1278   int count = bitset_to_regs(bitset, regs);
1279 
1280   for (int i = count - 1; i >= 0; i--) {
1281     vl1r_v(as_VectorRegister(regs[i]), stack);
1282     add(stack, stack, vector_size_in_bytes);
1283   }
1284 
1285   return count * vector_size_in_bytes / wordSize;
1286 }
1287 #endif // COMPILER2
1288 
1289 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
1290   // Push integer registers x7, x10-x17, x28-x31.
1291   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1292 
1293   // Push float registers f0-f7, f10-f17, f28-f31.
1294   addi(sp, sp, - wordSize * 20);
1295   int offset = 0;
1296   for (int i = 0; i < 32; i++) {
1297     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1298       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1299     }
1300   }
1301 }
1302 
1303 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
1304   int offset = 0;
1305   for (int i = 0; i < 32; i++) {
1306     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1307       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1308     }
1309   }
1310   addi(sp, sp, wordSize * 20);
1311 
1312   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1313 }
1314 
1315 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
1316   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1317   push_reg(RegSet::range(x5, x31), sp);
1318 
1319   // float registers
1320   addi(sp, sp, - 32 * wordSize);
1321   for (int i = 0; i < 32; i++) {
1322     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
1323   }
1324 
1325   // vector registers
1326   if (save_vectors) {
1327     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
1328     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1329     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1330       add(t0, sp, vector_size_in_bytes * i);
1331       vse64_v(as_VectorRegister(i), t0);
1332     }
1333   }
1334 }
1335 
1336 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
1337   // vector registers
1338   if (restore_vectors) {
1339     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1340     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1341       vle64_v(as_VectorRegister(i), sp);
1342       add(sp, sp, vector_size_in_bytes * 8);
1343     }
1344   }
1345 
1346   // float registers
1347   for (int i = 0; i < 32; i++) {
1348     fld(as_FloatRegister(i), Address(sp, i * wordSize));
1349   }
1350   addi(sp, sp, 32 * wordSize);
1351 
1352   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1353   pop_reg(RegSet::range(x5, x31), sp);
1354 }
1355 
1356 static int patch_offset_in_jal(address branch, int64_t offset) {
1357   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
1358          "offset is too large to be patched in one jal instruction!\n");
1359   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
1360   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
1361   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
1362   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
1363   return NativeInstruction::instruction_size;                                   // only one instruction
1364 }
1365 
1366 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
1367   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
1368          "offset is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n");
1369   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
1370   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
1371   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
1372   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
1373   return NativeInstruction::instruction_size;                                   // only one instruction
1374 }
1375 
1376 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
1377   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
1378   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
1379   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
1380   return PC_RELATIVE_INSTRUCTION_NUM * NativeInstruction::instruction_size;
1381 }
1382 
1383 static int patch_addr_in_movptr(address branch, address target) {
1384   const int MOVPTR_INSTRUCTIONS_NUM = 6;                                        // lui + addi + slli + addi + slli + addi/jalr/load
1385   int32_t lower = ((intptr_t)target << 35) >> 35;
1386   int64_t upper = ((intptr_t)target - lower) >> 29;
1387   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
1388   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
1389   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
1390   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
1391   return MOVPTR_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1392 }
1393 
1394 static int patch_imm_in_li64(address branch, address target) {
1395   const int LI64_INSTRUCTIONS_NUM = 8;                                          // lui + addi + slli + addi + slli + addi + slli + addi
1396   int64_t lower = (intptr_t)target & 0xffffffff;
1397   lower = lower - ((lower << 44) >> 44);
1398   int64_t tmp_imm = ((uint64_t)((intptr_t)target & 0xffffffff00000000)) + (uint64_t)lower;
1399   int32_t upper =  (tmp_imm - (int32_t)lower) >> 32;
1400   int64_t tmp_upper = upper, tmp_lower = upper;
1401   tmp_lower = (tmp_lower << 52) >> 52;
1402   tmp_upper -= tmp_lower;
1403   tmp_upper >>= 12;
1404   // Load upper 32 bits. Upper = target[63:32], but if target[31] = 1 or (target[31:20] == 0x7ff && target[19] == 1),
1405   // upper = target[63:32] + 1.
1406   Assembler::patch(branch + 0,  31, 12, tmp_upper & 0xfffff);                       // Lui.
1407   Assembler::patch(branch + 4,  31, 20, tmp_lower & 0xfff);                         // Addi.
1408   // Load the rest 32 bits.
1409   Assembler::patch(branch + 12, 31, 20, ((int32_t)lower >> 20) & 0xfff);            // Addi.
1410   Assembler::patch(branch + 20, 31, 20, (((intptr_t)target << 44) >> 52) & 0xfff);  // Addi.
1411   Assembler::patch(branch + 28, 31, 20, (intptr_t)target & 0xff);                   // Addi.
1412   return LI64_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1413 }
1414 
1415 static int patch_imm_in_li16u(address branch, uint16_t target) {
1416   Assembler::patch(branch, 31, 12, target); // patch lui only
1417   return NativeInstruction::instruction_size;
1418 }
1419 
1420 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
1421   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
1422   int64_t upper = (intptr_t)target;
1423   int32_t lower = (((int32_t)target) << 20) >> 20;
1424   upper -= lower;
1425   upper = (int32_t)upper;
1426   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
1427   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
1428   return LI32_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1429 }
1430 
1431 static long get_offset_of_jal(address insn_addr) {
1432   assert_cond(insn_addr != nullptr);
1433   long offset = 0;
1434   unsigned insn = Assembler::ld_instr(insn_addr);
1435   long val = (long)Assembler::sextract(insn, 31, 12);
1436   offset |= ((val >> 19) & 0x1) << 20;
1437   offset |= (val & 0xff) << 12;
1438   offset |= ((val >> 8) & 0x1) << 11;
1439   offset |= ((val >> 9) & 0x3ff) << 1;
1440   offset = (offset << 43) >> 43;
1441   return offset;
1442 }
1443 
1444 static long get_offset_of_conditional_branch(address insn_addr) {
1445   long offset = 0;
1446   assert_cond(insn_addr != nullptr);
1447   unsigned insn = Assembler::ld_instr(insn_addr);
1448   offset = (long)Assembler::sextract(insn, 31, 31);
1449   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
1450   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
1451   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
1452   offset = (offset << 41) >> 41;
1453   return offset;
1454 }
1455 
1456 static long get_offset_of_pc_relative(address insn_addr) {
1457   long offset = 0;
1458   assert_cond(insn_addr != nullptr);
1459   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
1460   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
1461   offset = (offset << 32) >> 32;
1462   return offset;
1463 }
1464 
1465 static address get_target_of_movptr(address insn_addr) {
1466   assert_cond(insn_addr != nullptr);
1467   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
1468   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
1469   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
1470   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
1471   return (address) target_address;
1472 }
1473 
1474 static address get_target_of_li64(address insn_addr) {
1475   assert_cond(insn_addr != nullptr);
1476   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 44; // Lui.
1477   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 32;                 // Addi.
1478   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 20;                // Addi.
1479   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20)) << 8;                 // Addi.
1480   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 28), 31, 20));                      // Addi.
1481   return (address)target_address;
1482 }
1483 
1484 address MacroAssembler::get_target_of_li32(address insn_addr) {
1485   assert_cond(insn_addr != nullptr);
1486   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
1487   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
1488   return (address)target_address;
1489 }
1490 
1491 // Patch any kind of instruction; there may be several instructions.
1492 // Return the total length (in bytes) of the instructions.
1493 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
1494   assert_cond(branch != nullptr);
1495   int64_t offset = target - branch;
1496   if (NativeInstruction::is_jal_at(branch)) {                         // jal
1497     return patch_offset_in_jal(branch, offset);
1498   } else if (NativeInstruction::is_branch_at(branch)) {               // beq/bge/bgeu/blt/bltu/bne
1499     return patch_offset_in_conditional_branch(branch, offset);
1500   } else if (NativeInstruction::is_pc_relative_at(branch)) {          // auipc, addi/jalr/load
1501     return patch_offset_in_pc_relative(branch, offset);
1502   } else if (NativeInstruction::is_movptr_at(branch)) {               // movptr
1503     return patch_addr_in_movptr(branch, target);
1504   } else if (NativeInstruction::is_li64_at(branch)) {                 // li64
1505     return patch_imm_in_li64(branch, target);
1506   } else if (NativeInstruction::is_li32_at(branch)) {                 // li32
1507     int64_t imm = (intptr_t)target;
1508     return patch_imm_in_li32(branch, (int32_t)imm);
1509   } else if (NativeInstruction::is_li16u_at(branch)) {
1510     int64_t imm = (intptr_t)target;
1511     return patch_imm_in_li16u(branch, (uint16_t)imm);
1512   } else {
1513 #ifdef ASSERT
1514     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
1515                   Assembler::ld_instr(branch), p2i(branch));
1516     Disassembler::decode(branch - 16, branch + 16);
1517 #endif
1518     ShouldNotReachHere();
1519     return -1;
1520   }
1521 }
1522 
1523 address MacroAssembler::target_addr_for_insn(address insn_addr) {
1524   long offset = 0;
1525   assert_cond(insn_addr != nullptr);
1526   if (NativeInstruction::is_jal_at(insn_addr)) {                     // jal
1527     offset = get_offset_of_jal(insn_addr);
1528   } else if (NativeInstruction::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
1529     offset = get_offset_of_conditional_branch(insn_addr);
1530   } else if (NativeInstruction::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
1531     offset = get_offset_of_pc_relative(insn_addr);
1532   } else if (NativeInstruction::is_movptr_at(insn_addr)) {           // movptr
1533     return get_target_of_movptr(insn_addr);
1534   } else if (NativeInstruction::is_li64_at(insn_addr)) {             // li64
1535     return get_target_of_li64(insn_addr);
1536   } else if (NativeInstruction::is_li32_at(insn_addr)) {             // li32
1537     return get_target_of_li32(insn_addr);
1538   } else {
1539     ShouldNotReachHere();
1540   }
1541   return address(((uintptr_t)insn_addr + offset));
1542 }
1543 
1544 int MacroAssembler::patch_oop(address insn_addr, address o) {
1545   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
1546   // narrow OOPs by setting the upper 16 bits in the first
1547   // instruction.
1548   if (NativeInstruction::is_li32_at(insn_addr)) {
1549     // Move narrow OOP
1550     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
1551     return patch_imm_in_li32(insn_addr, (int32_t)n);
1552   } else if (NativeInstruction::is_movptr_at(insn_addr)) {
1553     // Move wide OOP
1554     return patch_addr_in_movptr(insn_addr, o);
1555   }
1556   ShouldNotReachHere();
1557   return -1;
1558 }
1559 
1560 void MacroAssembler::reinit_heapbase() {
1561   if (UseCompressedOops) {
1562     if (Universe::is_fully_initialized()) {
1563       mv(xheapbase, CompressedOops::ptrs_base());
1564     } else {
1565       ExternalAddress target(CompressedOops::ptrs_base_addr());
1566       relocate(target.rspec(), [&] {
1567         int32_t offset;
1568         la_patchable(xheapbase, target, offset);
1569         ld(xheapbase, Address(xheapbase, offset));
1570       });
1571     }
1572   }
1573 }
1574 
1575 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset) {
1576   int64_t imm64 = (int64_t)addr;
1577 #ifndef PRODUCT
1578   {
1579     char buffer[64];
1580     snprintf(buffer, sizeof(buffer), "0x%" PRIx64, imm64);
1581     block_comment(buffer);
1582   }
1583 #endif
1584   assert((uintptr_t)imm64 < (1ull << 48), "48-bit overflow in address constant");
1585   // Load upper 31 bits
1586   int64_t imm = imm64 >> 17;
1587   int64_t upper = imm, lower = imm;
1588   lower = (lower << 52) >> 52;
1589   upper -= lower;
1590   upper = (int32_t)upper;
1591   lui(Rd, upper);
1592   addi(Rd, Rd, lower);
1593 
1594   // Load the rest 17 bits.
1595   slli(Rd, Rd, 11);
1596   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
1597   slli(Rd, Rd, 6);
1598 
1599   // This offset will be used by following jalr/ld.
1600   offset = imm64 & 0x3f;
1601 }
1602 
1603 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register temp) {
1604   if (is_simm12(increment)) {
1605     addi(Rd, Rn, increment);
1606   } else {
1607     assert_different_registers(Rn, temp);
1608     li(temp, increment);
1609     add(Rd, Rn, temp);
1610   }
1611 }
1612 
1613 void MacroAssembler::addw(Register Rd, Register Rn, int32_t increment, Register temp) {
1614   if (is_simm12(increment)) {
1615     addiw(Rd, Rn, increment);
1616   } else {
1617     assert_different_registers(Rn, temp);
1618     li(temp, increment);
1619     addw(Rd, Rn, temp);
1620   }
1621 }
1622 
1623 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register temp) {
1624   if (is_simm12(-decrement)) {
1625     addi(Rd, Rn, -decrement);
1626   } else {
1627     assert_different_registers(Rn, temp);
1628     li(temp, decrement);
1629     sub(Rd, Rn, temp);
1630   }
1631 }
1632 
1633 void MacroAssembler::subw(Register Rd, Register Rn, int32_t decrement, Register temp) {
1634   if (is_simm12(-decrement)) {
1635     addiw(Rd, Rn, -decrement);
1636   } else {
1637     assert_different_registers(Rn, temp);
1638     li(temp, decrement);
1639     subw(Rd, Rn, temp);
1640   }
1641 }
1642 
1643 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
1644   andr(Rd, Rs1, Rs2);
1645   sign_extend(Rd, Rd, 32);
1646 }
1647 
1648 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
1649   orr(Rd, Rs1, Rs2);
1650   sign_extend(Rd, Rd, 32);
1651 }
1652 
1653 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
1654   xorr(Rd, Rs1, Rs2);
1655   sign_extend(Rd, Rd, 32);
1656 }
1657 
1658 // Rd = Rs1 & (~Rd2)
1659 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
1660   if (UseZbb) {
1661     Assembler::andn(Rd, Rs1, Rs2);
1662     return;
1663   }
1664 
1665   notr(Rd, Rs2);
1666   andr(Rd, Rs1, Rd);
1667 }
1668 
1669 // Rd = Rs1 | (~Rd2)
1670 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
1671   if (UseZbb) {
1672     Assembler::orn(Rd, Rs1, Rs2);
1673     return;
1674   }
1675 
1676   notr(Rd, Rs2);
1677   orr(Rd, Rs1, Rd);
1678 }
1679 
1680 // Note: load_unsigned_short used to be called load_unsigned_word.
1681 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1682   int off = offset();
1683   lhu(dst, src);
1684   return off;
1685 }
1686 
1687 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1688   int off = offset();
1689   lbu(dst, src);
1690   return off;
1691 }
1692 
1693 int MacroAssembler::load_signed_short(Register dst, Address src) {
1694   int off = offset();
1695   lh(dst, src);
1696   return off;
1697 }
1698 
1699 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1700   int off = offset();
1701   lb(dst, src);
1702   return off;
1703 }
1704 
1705 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
1706   switch (size_in_bytes) {
1707     case  8:  ld(dst, src); break;
1708     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
1709     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1710     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1711     default:  ShouldNotReachHere();
1712   }
1713 }
1714 
1715 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
1716   switch (size_in_bytes) {
1717     case  8:  sd(src, dst); break;
1718     case  4:  sw(src, dst); break;
1719     case  2:  sh(src, dst); break;
1720     case  1:  sb(src, dst); break;
1721     default:  ShouldNotReachHere();
1722   }
1723 }
1724 
1725 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
1726 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
1727   if (granularity != 1 && granularity != 2) {
1728     ShouldNotReachHere();
1729   }
1730   if (AvoidUnalignedAccesses && (granularity != 2)) {
1731     assert_different_registers(dst, tmp);
1732     assert_different_registers(tmp, src.base());
1733     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
1734     slli(tmp, tmp, 8);
1735     lbu(dst, src);
1736     add(dst, dst, tmp);
1737   } else {
1738     is_signed ? lh(dst, src) : lhu(dst, src);
1739   }
1740 }
1741 
1742 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
1743 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
1744   if (AvoidUnalignedAccesses && (granularity != 4)) {
1745     switch(granularity) {
1746       case 1:
1747         assert_different_registers(dst, tmp, src.base());
1748         lbu(dst, src);
1749         lbu(tmp, Address(src.base(), src.offset() + 1));
1750         slli(tmp, tmp, 8);
1751         add(dst, dst, tmp);
1752         lbu(tmp, Address(src.base(), src.offset() + 2));
1753         slli(tmp, tmp, 16);
1754         add(dst, dst, tmp);
1755         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
1756         slli(tmp, tmp, 24);
1757         add(dst, dst, tmp);
1758         break;
1759       case 2:
1760         assert_different_registers(dst, tmp);
1761         assert_different_registers(tmp, src.base());
1762         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
1763         slli(tmp, tmp, 16);
1764         lhu(dst, src);
1765         add(dst, dst, tmp);
1766         break;
1767       default:
1768         ShouldNotReachHere();
1769     }
1770   } else {
1771     is_signed ? lw(dst, src) : lwu(dst, src);
1772   }
1773 }
1774 
1775 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
1776 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
1777   if (AvoidUnalignedAccesses && (granularity != 8)) {
1778     switch(granularity){
1779       case 1:
1780         assert_different_registers(dst, tmp, src.base());
1781         lbu(dst, src);
1782         lbu(tmp, Address(src.base(), src.offset() + 1));
1783         slli(tmp, tmp, 8);
1784         add(dst, dst, tmp);
1785         lbu(tmp, Address(src.base(), src.offset() + 2));
1786         slli(tmp, tmp, 16);
1787         add(dst, dst, tmp);
1788         lbu(tmp, Address(src.base(), src.offset() + 3));
1789         slli(tmp, tmp, 24);
1790         add(dst, dst, tmp);
1791         lbu(tmp, Address(src.base(), src.offset() + 4));
1792         slli(tmp, tmp, 32);
1793         add(dst, dst, tmp);
1794         lbu(tmp, Address(src.base(), src.offset() + 5));
1795         slli(tmp, tmp, 40);
1796         add(dst, dst, tmp);
1797         lbu(tmp, Address(src.base(), src.offset() + 6));
1798         slli(tmp, tmp, 48);
1799         add(dst, dst, tmp);
1800         lbu(tmp, Address(src.base(), src.offset() + 7));
1801         slli(tmp, tmp, 56);
1802         add(dst, dst, tmp);
1803         break;
1804       case 2:
1805         assert_different_registers(dst, tmp, src.base());
1806         lhu(dst, src);
1807         lhu(tmp, Address(src.base(), src.offset() + 2));
1808         slli(tmp, tmp, 16);
1809         add(dst, dst, tmp);
1810         lhu(tmp, Address(src.base(), src.offset() + 4));
1811         slli(tmp, tmp, 32);
1812         add(dst, dst, tmp);
1813         lhu(tmp, Address(src.base(), src.offset() + 6));
1814         slli(tmp, tmp, 48);
1815         add(dst, dst, tmp);
1816         break;
1817       case 4:
1818         assert_different_registers(dst, tmp);
1819         assert_different_registers(tmp, src.base());
1820         lwu(tmp, Address(src.base(), src.offset() + 4));
1821         slli(tmp, tmp, 32);
1822         lwu(dst, src);
1823         add(dst, dst, tmp);
1824         break;
1825       default:
1826         ShouldNotReachHere();
1827     }
1828   } else {
1829     ld(dst, src);
1830   }
1831 }
1832 
1833 
1834 // reverse bytes in halfword in lower 16 bits and sign-extend
1835 // Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
1836 void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
1837   if (UseZbb) {
1838     rev8(Rd, Rs);
1839     srai(Rd, Rd, 48);
1840     return;
1841   }
1842   assert_different_registers(Rs, tmp);
1843   assert_different_registers(Rd, tmp);
1844   srli(tmp, Rs, 8);
1845   andi(tmp, tmp, 0xFF);
1846   slli(Rd, Rs, 56);
1847   srai(Rd, Rd, 48); // sign-extend
1848   orr(Rd, Rd, tmp);
1849 }
1850 
1851 // reverse bytes in lower word and sign-extend
1852 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
1853 void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1854   if (UseZbb) {
1855     rev8(Rd, Rs);
1856     srai(Rd, Rd, 32);
1857     return;
1858   }
1859   assert_different_registers(Rs, tmp1, tmp2);
1860   assert_different_registers(Rd, tmp1, tmp2);
1861   revb_h_w_u(Rd, Rs, tmp1, tmp2);
1862   slli(tmp2, Rd, 48);
1863   srai(tmp2, tmp2, 32); // sign-extend
1864   srli(Rd, Rd, 16);
1865   orr(Rd, Rd, tmp2);
1866 }
1867 
1868 // reverse bytes in halfword in lower 16 bits and zero-extend
1869 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1870 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
1871   if (UseZbb) {
1872     rev8(Rd, Rs);
1873     srli(Rd, Rd, 48);
1874     return;
1875   }
1876   assert_different_registers(Rs, tmp);
1877   assert_different_registers(Rd, tmp);
1878   srli(tmp, Rs, 8);
1879   andi(tmp, tmp, 0xFF);
1880   andi(Rd, Rs, 0xFF);
1881   slli(Rd, Rd, 8);
1882   orr(Rd, Rd, tmp);
1883 }
1884 
1885 // reverse bytes in halfwords in lower 32 bits and zero-extend
1886 // Rd[31:0] = Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1887 void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1888   if (UseZbb) {
1889     rev8(Rd, Rs);
1890     rori(Rd, Rd, 32);
1891     roriw(Rd, Rd, 16);
1892     zero_extend(Rd, Rd, 32);
1893     return;
1894   }
1895   assert_different_registers(Rs, tmp1, tmp2);
1896   assert_different_registers(Rd, tmp1, tmp2);
1897   srli(tmp2, Rs, 16);
1898   revb_h_h_u(tmp2, tmp2, tmp1);
1899   revb_h_h_u(Rd, Rs, tmp1);
1900   slli(tmp2, tmp2, 16);
1901   orr(Rd, Rd, tmp2);
1902 }
1903 
1904 // This method is only used for revb_h
1905 // Rd = Rs[47:0] Rs[55:48] Rs[63:56]
1906 void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1907   assert_different_registers(Rs, tmp1, tmp2);
1908   assert_different_registers(Rd, tmp1);
1909   srli(tmp1, Rs, 48);
1910   andi(tmp2, tmp1, 0xFF);
1911   slli(tmp2, tmp2, 8);
1912   srli(tmp1, tmp1, 8);
1913   orr(tmp1, tmp1, tmp2);
1914   slli(Rd, Rs, 16);
1915   orr(Rd, Rd, tmp1);
1916 }
1917 
1918 // reverse bytes in each halfword
1919 // Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
1920 void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1921   if (UseZbb) {
1922     assert_different_registers(Rs, tmp1);
1923     assert_different_registers(Rd, tmp1);
1924     rev8(Rd, Rs);
1925     zero_extend(tmp1, Rd, 32);
1926     roriw(tmp1, tmp1, 16);
1927     slli(tmp1, tmp1, 32);
1928     srli(Rd, Rd, 32);
1929     roriw(Rd, Rd, 16);
1930     zero_extend(Rd, Rd, 32);
1931     orr(Rd, Rd, tmp1);
1932     return;
1933   }
1934   assert_different_registers(Rs, tmp1, tmp2);
1935   assert_different_registers(Rd, tmp1, tmp2);
1936   revb_h_helper(Rd, Rs, tmp1, tmp2);
1937   for (int i = 0; i < 3; ++i) {
1938     revb_h_helper(Rd, Rd, tmp1, tmp2);
1939   }
1940 }
1941 
1942 // reverse bytes in each word
1943 // Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
1944 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1945   if (UseZbb) {
1946     rev8(Rd, Rs);
1947     rori(Rd, Rd, 32);
1948     return;
1949   }
1950   assert_different_registers(Rs, tmp1, tmp2);
1951   assert_different_registers(Rd, tmp1, tmp2);
1952   revb(Rd, Rs, tmp1, tmp2);
1953   ror_imm(Rd, Rd, 32);
1954 }
1955 
1956 // reverse bytes in doubleword
1957 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
1958 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1959   if (UseZbb) {
1960     rev8(Rd, Rs);
1961     return;
1962   }
1963   assert_different_registers(Rs, tmp1, tmp2);
1964   assert_different_registers(Rd, tmp1, tmp2);
1965   andi(tmp1, Rs, 0xFF);
1966   slli(tmp1, tmp1, 8);
1967   for (int step = 8; step < 56; step += 8) {
1968     srli(tmp2, Rs, step);
1969     andi(tmp2, tmp2, 0xFF);
1970     orr(tmp1, tmp1, tmp2);
1971     slli(tmp1, tmp1, 8);
1972   }
1973   srli(Rd, Rs, 56);
1974   andi(Rd, Rd, 0xFF);
1975   orr(Rd, tmp1, Rd);
1976 }
1977 
1978 // rotate right with shift bits
1979 void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Register tmp)
1980 {
1981   if (UseZbb) {
1982     rori(dst, src, shift);
1983     return;
1984   }
1985 
1986   assert_different_registers(dst, tmp);
1987   assert_different_registers(src, tmp);
1988   assert(shift < 64, "shift amount must be < 64");
1989   slli(tmp, src, 64 - shift);
1990   srli(dst, src, shift);
1991   orr(dst, dst, tmp);
1992 }
1993 
1994 // rotate left with shift bits, 32-bit version
1995 void MacroAssembler::rolw_imm(Register dst, Register src, uint32_t shift, Register tmp) {
1996   if (UseZbb) {
1997     // no roliw available
1998     roriw(dst, src, 32 - shift);
1999     return;
2000   }
2001 
2002   assert_different_registers(dst, tmp);
2003   assert_different_registers(src, tmp);
2004   assert(shift < 32, "shift amount must be < 32");
2005   srliw(tmp, src, 32 - shift);
2006   slliw(dst, src, shift);
2007   orr(dst, dst, tmp);
2008 }
2009 
2010 void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) {
2011   if (is_simm12(imm)) {
2012     and_imm12(Rd, Rn, imm);
2013   } else {
2014     assert_different_registers(Rn, tmp);
2015     mv(tmp, imm);
2016     andr(Rd, Rn, tmp);
2017   }
2018 }
2019 
2020 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
2021   ld(tmp1, adr);
2022   if (src.is_register()) {
2023     orr(tmp1, tmp1, src.as_register());
2024   } else {
2025     if (is_simm12(src.as_constant())) {
2026       ori(tmp1, tmp1, src.as_constant());
2027     } else {
2028       assert_different_registers(tmp1, tmp2);
2029       mv(tmp2, src.as_constant());
2030       orr(tmp1, tmp1, tmp2);
2031     }
2032   }
2033   sd(tmp1, adr);
2034 }
2035 
2036 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L) {
2037   assert_different_registers(oop, trial_klass, tmp1, tmp2);
2038   if (UseCompressedClassPointers) {
2039     lwu(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2040     if (CompressedKlassPointers::base() == nullptr) {
2041       slli(tmp1, tmp1, CompressedKlassPointers::shift());
2042       beq(trial_klass, tmp1, L);
2043       return;
2044     }
2045     decode_klass_not_null(tmp1, tmp2);
2046   } else {
2047     ld(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2048   }
2049   beq(trial_klass, tmp1, L);
2050 }
2051 
2052 // Move an oop into a register.
2053 void MacroAssembler::movoop(Register dst, jobject obj) {
2054   int oop_index;
2055   if (obj == nullptr) {
2056     oop_index = oop_recorder()->allocate_oop_index(obj);
2057   } else {
2058 #ifdef ASSERT
2059     {
2060       ThreadInVMfromUnknown tiv;
2061       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
2062     }
2063 #endif
2064     oop_index = oop_recorder()->find_index(obj);
2065   }
2066   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2067 
2068   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
2069     mv(dst, Address((address)obj, rspec));
2070   } else {
2071     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
2072     ld_constant(dst, Address(dummy, rspec));
2073   }
2074 }
2075 
2076 // Move a metadata address into a register.
2077 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2078   int oop_index;
2079   if (obj == nullptr) {
2080     oop_index = oop_recorder()->allocate_metadata_index(obj);
2081   } else {
2082     oop_index = oop_recorder()->find_index(obj);
2083   }
2084   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2085   mv(dst, Address((address)obj, rspec));
2086 }
2087 
2088 // Writes to stack successive pages until offset reached to check for
2089 // stack overflow + shadow pages.  This clobbers tmp.
2090 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2091   assert_different_registers(tmp, size, t0);
2092   // Bang stack for total size given plus shadow page size.
2093   // Bang one page at a time because large size can bang beyond yellow and
2094   // red zones.
2095   mv(t0, (int)os::vm_page_size());
2096   Label loop;
2097   bind(loop);
2098   sub(tmp, sp, t0);
2099   subw(size, size, t0);
2100   sd(size, Address(tmp));
2101   bgtz(size, loop);
2102 
2103   // Bang down shadow pages too.
2104   // At this point, (tmp-0) is the last address touched, so don't
2105   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2106   // was post-decremented.)  Skip this address by starting at i=1, and
2107   // touch a few more pages below.  N.B.  It is important to touch all
2108   // the way down to and including i=StackShadowPages.
2109   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
2110     // this could be any sized move but this is can be a debugging crumb
2111     // so the bigger the better.
2112     sub(tmp, tmp, (int)os::vm_page_size());
2113     sd(size, Address(tmp, 0));
2114   }
2115 }
2116 
2117 SkipIfEqual::SkipIfEqual(MacroAssembler* masm, const bool* flag_addr, bool value) {
2118   int32_t offset = 0;
2119   _masm = masm;
2120   ExternalAddress target((address)flag_addr);
2121   _masm->relocate(target.rspec(), [&] {
2122     int32_t offset;
2123     _masm->la_patchable(t0, target, offset);
2124     _masm->lbu(t0, Address(t0, offset));
2125   });
2126   if (value) {
2127     _masm->bnez(t0, _label);
2128   } else {
2129     _masm->beqz(t0, _label);
2130   }
2131 }
2132 
2133 SkipIfEqual::~SkipIfEqual() {
2134   _masm->bind(_label);
2135   _masm = nullptr;
2136 }
2137 
2138 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
2139   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
2140   ld(dst, Address(xmethod, Method::const_offset()));
2141   ld(dst, Address(dst, ConstMethod::constants_offset()));
2142   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
2143   ld(dst, Address(dst, mirror_offset));
2144   resolve_oop_handle(dst, tmp1, tmp2);
2145 }
2146 
2147 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
2148   // OopHandle::resolve is an indirection.
2149   assert_different_registers(result, tmp1, tmp2);
2150   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
2151 }
2152 
2153 // ((WeakHandle)result).resolve()
2154 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
2155   assert_different_registers(result, tmp1, tmp2);
2156   Label resolved;
2157 
2158   // A null weak handle resolves to null.
2159   beqz(result, resolved);
2160 
2161   // Only 64 bit platforms support GCs that require a tmp register
2162   // Only IN_HEAP loads require a thread_tmp register
2163   // WeakHandle::resolve is an indirection like jweak.
2164   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2165                  result, Address(result), tmp1, tmp2);
2166   bind(resolved);
2167 }
2168 
2169 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
2170                                     Register dst, Address src,
2171                                     Register tmp1, Register tmp2) {
2172   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2173   decorators = AccessInternal::decorator_fixup(decorators, type);
2174   bool as_raw = (decorators & AS_RAW) != 0;
2175   if (as_raw) {
2176     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
2177   } else {
2178     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
2179   }
2180 }
2181 
2182 void MacroAssembler::null_check(Register reg, int offset) {
2183   if (needs_explicit_null_check(offset)) {
2184     // provoke OS null exception if reg is null by
2185     // accessing M[reg] w/o changing any registers
2186     // NOTE: this is plenty to provoke a segv
2187     ld(zr, Address(reg, 0));
2188   } else {
2189     // nothing to do, (later) access of M[reg + offset]
2190     // will provoke OS null exception if reg is null
2191   }
2192 }
2193 
2194 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
2195                                      Address dst, Register val,
2196                                      Register tmp1, Register tmp2, Register tmp3) {
2197   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2198   decorators = AccessInternal::decorator_fixup(decorators, type);
2199   bool as_raw = (decorators & AS_RAW) != 0;
2200   if (as_raw) {
2201     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2202   } else {
2203     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2204   }
2205 }
2206 
2207 // Algorithm must match CompressedOops::encode.
2208 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2209   verify_oop_msg(s, "broken oop in encode_heap_oop");
2210   if (CompressedOops::base() == nullptr) {
2211     if (CompressedOops::shift() != 0) {
2212       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2213       srli(d, s, LogMinObjAlignmentInBytes);
2214     } else {
2215       mv(d, s);
2216     }
2217   } else {
2218     Label notNull;
2219     sub(d, s, xheapbase);
2220     bgez(d, notNull);
2221     mv(d, zr);
2222     bind(notNull);
2223     if (CompressedOops::shift() != 0) {
2224       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2225       srli(d, d, CompressedOops::shift());
2226     }
2227   }
2228 }
2229 
2230 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
2231   assert_different_registers(dst, tmp);
2232   assert_different_registers(src, tmp);
2233   if (UseCompressedClassPointers) {
2234     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2235     decode_klass_not_null(dst, tmp);
2236   } else {
2237     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2238   }
2239 }
2240 
2241 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
2242   // FIXME: Should this be a store release? concurrent gcs assumes
2243   // klass length is valid if klass field is not null.
2244   if (UseCompressedClassPointers) {
2245     encode_klass_not_null(src, tmp);
2246     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2247   } else {
2248     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2249   }
2250 }
2251 
2252 void MacroAssembler::store_klass_gap(Register dst, Register src) {
2253   if (UseCompressedClassPointers) {
2254     // Store to klass gap in destination
2255     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2256   }
2257 }
2258 
2259 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
2260   assert_different_registers(r, tmp);
2261   decode_klass_not_null(r, r, tmp);
2262 }
2263 
2264 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
2265   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2266 
2267   if (CompressedKlassPointers::base() == nullptr) {
2268     if (CompressedKlassPointers::shift() != 0) {
2269       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2270       slli(dst, src, LogKlassAlignmentInBytes);
2271     } else {
2272       mv(dst, src);
2273     }
2274     return;
2275   }
2276 
2277   Register xbase = dst;
2278   if (dst == src) {
2279     xbase = tmp;
2280   }
2281 
2282   assert_different_registers(src, xbase);
2283   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2284 
2285   if (CompressedKlassPointers::shift() != 0) {
2286     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2287     assert_different_registers(t0, xbase);
2288     shadd(dst, src, xbase, t0, LogKlassAlignmentInBytes);
2289   } else {
2290     add(dst, xbase, src);
2291   }
2292 }
2293 
2294 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
2295   assert_different_registers(r, tmp);
2296   encode_klass_not_null(r, r, tmp);
2297 }
2298 
2299 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
2300   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2301 
2302   if (CompressedKlassPointers::base() == nullptr) {
2303     if (CompressedKlassPointers::shift() != 0) {
2304       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2305       srli(dst, src, LogKlassAlignmentInBytes);
2306     } else {
2307       mv(dst, src);
2308     }
2309     return;
2310   }
2311 
2312   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
2313       CompressedKlassPointers::shift() == 0) {
2314     zero_extend(dst, src, 32);
2315     return;
2316   }
2317 
2318   Register xbase = dst;
2319   if (dst == src) {
2320     xbase = tmp;
2321   }
2322 
2323   assert_different_registers(src, xbase);
2324   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2325   sub(dst, src, xbase);
2326   if (CompressedKlassPointers::shift() != 0) {
2327     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2328     srli(dst, dst, LogKlassAlignmentInBytes);
2329   }
2330 }
2331 
2332 void MacroAssembler::decode_heap_oop_not_null(Register r) {
2333   decode_heap_oop_not_null(r, r);
2334 }
2335 
2336 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
2337   assert(UseCompressedOops, "should only be used for compressed headers");
2338   assert(Universe::heap() != nullptr, "java heap should be initialized");
2339   // Cannot assert, unverified entry point counts instructions (see .ad file)
2340   // vtableStubs also counts instructions in pd_code_size_limit.
2341   // Also do not verify_oop as this is called by verify_oop.
2342   if (CompressedOops::shift() != 0) {
2343     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2344     slli(dst, src, LogMinObjAlignmentInBytes);
2345     if (CompressedOops::base() != nullptr) {
2346       add(dst, xheapbase, dst);
2347     }
2348   } else {
2349     assert(CompressedOops::base() == nullptr, "sanity");
2350     mv(dst, src);
2351   }
2352 }
2353 
2354 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
2355   if (CompressedOops::base() == nullptr) {
2356     if (CompressedOops::shift() != 0 || d != s) {
2357       slli(d, s, CompressedOops::shift());
2358     }
2359   } else {
2360     Label done;
2361     mv(d, s);
2362     beqz(s, done);
2363     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
2364     bind(done);
2365   }
2366   verify_oop_msg(d, "broken oop in decode_heap_oop");
2367 }
2368 
2369 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
2370                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
2371   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
2372 }
2373 
2374 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
2375                                    Register tmp2, DecoratorSet decorators) {
2376   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
2377 }
2378 
2379 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
2380                                             Register tmp2, DecoratorSet decorators) {
2381   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
2382 }
2383 
2384 // Used for storing nulls.
2385 void MacroAssembler::store_heap_oop_null(Address dst) {
2386   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
2387 }
2388 
2389 int MacroAssembler::corrected_idivl(Register result, Register rs1, Register rs2,
2390                                     bool want_remainder)
2391 {
2392   // Full implementation of Java idiv and irem.  The function
2393   // returns the (pc) offset of the div instruction - may be needed
2394   // for implicit exceptions.
2395   //
2396   // input : rs1: dividend
2397   //         rs2: divisor
2398   //
2399   // result: either
2400   //         quotient  (= rs1 idiv rs2)
2401   //         remainder (= rs1 irem rs2)
2402 
2403 
2404   int idivl_offset = offset();
2405   if (!want_remainder) {
2406     divw(result, rs1, rs2);
2407   } else {
2408     remw(result, rs1, rs2); // result = rs1 % rs2;
2409   }
2410   return idivl_offset;
2411 }
2412 
2413 int MacroAssembler::corrected_idivq(Register result, Register rs1, Register rs2,
2414                                     bool want_remainder)
2415 {
2416   // Full implementation of Java ldiv and lrem.  The function
2417   // returns the (pc) offset of the div instruction - may be needed
2418   // for implicit exceptions.
2419   //
2420   // input : rs1: dividend
2421   //         rs2: divisor
2422   //
2423   // result: either
2424   //         quotient  (= rs1 idiv rs2)
2425   //         remainder (= rs1 irem rs2)
2426 
2427   int idivq_offset = offset();
2428   if (!want_remainder) {
2429     div(result, rs1, rs2);
2430   } else {
2431     rem(result, rs1, rs2); // result = rs1 % rs2;
2432   }
2433   return idivq_offset;
2434 }
2435 
2436 // Look up the method for a megamorpic invkkeinterface call.
2437 // The target method is determined by <intf_klass, itable_index>.
2438 // The receiver klass is in recv_klass.
2439 // On success, the result will be in method_result, and execution falls through.
2440 // On failure, execution transfers to the given label.
2441 void MacroAssembler::lookup_interface_method(Register recv_klass,
2442                                              Register intf_klass,
2443                                              RegisterOrConstant itable_index,
2444                                              Register method_result,
2445                                              Register scan_tmp,
2446                                              Label& L_no_such_interface,
2447                                              bool return_method) {
2448   assert_different_registers(recv_klass, intf_klass, scan_tmp);
2449   assert_different_registers(method_result, intf_klass, scan_tmp);
2450   assert(recv_klass != method_result || !return_method,
2451          "recv_klass can be destroyed when mehtid isn't needed");
2452   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
2453          "caller must be same register for non-constant itable index as for method");
2454 
2455   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
2456   int vtable_base = in_bytes(Klass::vtable_start_offset());
2457   int itentry_off = in_bytes(itableMethodEntry::method_offset());
2458   int scan_step   = itableOffsetEntry::size() * wordSize;
2459   int vte_size    = vtableEntry::size_in_bytes();
2460   assert(vte_size == wordSize, "else adjust times_vte_scale");
2461 
2462   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
2463 
2464   // %%% Could store the aligned, prescaled offset in the klassoop.
2465   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
2466   add(scan_tmp, scan_tmp, vtable_base);
2467 
2468   if (return_method) {
2469     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
2470     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
2471     if (itable_index.is_register()) {
2472       slli(t0, itable_index.as_register(), 3);
2473     } else {
2474       mv(t0, itable_index.as_constant() << 3);
2475     }
2476     add(recv_klass, recv_klass, t0);
2477     if (itentry_off) {
2478       add(recv_klass, recv_klass, itentry_off);
2479     }
2480   }
2481 
2482   Label search, found_method;
2483 
2484   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2485   beq(intf_klass, method_result, found_method);
2486   bind(search);
2487   // Check that the previous entry is non-null. A null entry means that
2488   // the receiver class doesn't implement the interface, and wasn't the
2489   // same as when the caller was compiled.
2490   beqz(method_result, L_no_such_interface, /* is_far */ true);
2491   addi(scan_tmp, scan_tmp, scan_step);
2492   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2493   bne(intf_klass, method_result, search);
2494 
2495   bind(found_method);
2496 
2497   // Got a hit.
2498   if (return_method) {
2499     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
2500     add(method_result, recv_klass, scan_tmp);
2501     ld(method_result, Address(method_result));
2502   }
2503 }
2504 
2505 // virtual method calling
2506 void MacroAssembler::lookup_virtual_method(Register recv_klass,
2507                                            RegisterOrConstant vtable_index,
2508                                            Register method_result) {
2509   const ByteSize base = Klass::vtable_start_offset();
2510   assert(vtableEntry::size() * wordSize == 8,
2511          "adjust the scaling in the code below");
2512   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
2513 
2514   if (vtable_index.is_register()) {
2515     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
2516     ld(method_result, Address(method_result, vtable_offset_in_bytes));
2517   } else {
2518     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
2519     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
2520   }
2521 }
2522 
2523 void MacroAssembler::membar(uint32_t order_constraint) {
2524   address prev = pc() - NativeMembar::instruction_size;
2525   address last = code()->last_insn();
2526 
2527   if (last != nullptr && nativeInstruction_at(last)->is_membar() && prev == last) {
2528     NativeMembar *bar = NativeMembar_at(prev);
2529     // We are merging two memory barrier instructions.  On RISCV we
2530     // can do this simply by ORing them together.
2531     bar->set_kind(bar->get_kind() | order_constraint);
2532     BLOCK_COMMENT("merged membar");
2533   } else {
2534     code()->set_last_insn(pc());
2535 
2536     uint32_t predecessor = 0;
2537     uint32_t successor = 0;
2538 
2539     membar_mask_to_pred_succ(order_constraint, predecessor, successor);
2540     fence(predecessor, successor);
2541   }
2542 }
2543 
2544 // Form an address from base + offset in Rd. Rd my or may not
2545 // actually be used: you must use the Address that is returned. It
2546 // is up to you to ensure that the shift provided matches the size
2547 // of your data.
2548 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
2549   if (is_simm12(byte_offset)) { // 12: imm in range 2^12
2550     return Address(base, byte_offset);
2551   }
2552 
2553   assert_different_registers(Rd, base, noreg);
2554 
2555   // Do it the hard way
2556   mv(Rd, byte_offset);
2557   add(Rd, base, Rd);
2558   return Address(Rd);
2559 }
2560 
2561 void MacroAssembler::check_klass_subtype(Register sub_klass,
2562                                          Register super_klass,
2563                                          Register tmp_reg,
2564                                          Label& L_success) {
2565   Label L_failure;
2566   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
2567   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
2568   bind(L_failure);
2569 }
2570 
2571 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
2572   ld(t0, Address(xthread, JavaThread::polling_word_offset()));
2573   if (acquire) {
2574     membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
2575   }
2576   if (at_return) {
2577     bgtu(in_nmethod ? sp : fp, t0, slow_path, /* is_far */ true);
2578   } else {
2579     test_bit(t0, t0, exact_log2(SafepointMechanism::poll_bit()));
2580     bnez(t0, slow_path, true /* is_far */);
2581   }
2582 }
2583 
2584 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2585                                 Label &succeed, Label *fail) {
2586   assert_different_registers(addr, tmp);
2587   assert_different_registers(newv, tmp);
2588   assert_different_registers(oldv, tmp);
2589 
2590   // oldv holds comparison value
2591   // newv holds value to write in exchange
2592   // addr identifies memory word to compare against/update
2593   Label retry_load, nope;
2594   bind(retry_load);
2595   // Load reserved from the memory location
2596   lr_d(tmp, addr, Assembler::aqrl);
2597   // Fail and exit if it is not what we expect
2598   bne(tmp, oldv, nope);
2599   // If the store conditional succeeds, tmp will be zero
2600   sc_d(tmp, newv, addr, Assembler::rl);
2601   beqz(tmp, succeed);
2602   // Retry only when the store conditional failed
2603   j(retry_load);
2604 
2605   bind(nope);
2606   membar(AnyAny);
2607   mv(oldv, tmp);
2608   if (fail != nullptr) {
2609     j(*fail);
2610   }
2611 }
2612 
2613 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2614                                         Label &succeed, Label *fail) {
2615   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2616   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2617 }
2618 
2619 void MacroAssembler::load_reserved(Register addr,
2620                                    enum operand_size size,
2621                                    Assembler::Aqrl acquire) {
2622   switch (size) {
2623     case int64:
2624       lr_d(t0, addr, acquire);
2625       break;
2626     case int32:
2627       lr_w(t0, addr, acquire);
2628       break;
2629     case uint32:
2630       lr_w(t0, addr, acquire);
2631       zero_extend(t0, t0, 32);
2632       break;
2633     default:
2634       ShouldNotReachHere();
2635   }
2636 }
2637 
2638 void MacroAssembler::store_conditional(Register addr,
2639                                        Register new_val,
2640                                        enum operand_size size,
2641                                        Assembler::Aqrl release) {
2642   switch (size) {
2643     case int64:
2644       sc_d(t0, new_val, addr, release);
2645       break;
2646     case int32:
2647     case uint32:
2648       sc_w(t0, new_val, addr, release);
2649       break;
2650     default:
2651       ShouldNotReachHere();
2652   }
2653 }
2654 
2655 
2656 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected,
2657                                                  Register new_val,
2658                                                  enum operand_size size,
2659                                                  Register tmp1, Register tmp2, Register tmp3) {
2660   assert(size == int8 || size == int16, "unsupported operand size");
2661 
2662   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3;
2663 
2664   andi(shift, addr, 3);
2665   slli(shift, shift, 3);
2666 
2667   andi(aligned_addr, addr, ~3);
2668 
2669   if (size == int8) {
2670     mv(mask, 0xff);
2671   } else {
2672     // size == int16 case
2673     mv(mask, -1);
2674     zero_extend(mask, mask, 16);
2675   }
2676   sll(mask, mask, shift);
2677 
2678   xori(not_mask, mask, -1);
2679 
2680   sll(expected, expected, shift);
2681   andr(expected, expected, mask);
2682 
2683   sll(new_val, new_val, shift);
2684   andr(new_val, new_val, mask);
2685 }
2686 
2687 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
2688 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w,
2689 // which are forced to work with 4-byte aligned address.
2690 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
2691                                           Register new_val,
2692                                           enum operand_size size,
2693                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
2694                                           Register result, bool result_as_bool,
2695                                           Register tmp1, Register tmp2, Register tmp3) {
2696   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2697   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2698   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2699 
2700   Label retry, fail, done;
2701 
2702   bind(retry);
2703   lr_w(old, aligned_addr, acquire);
2704   andr(tmp, old, mask);
2705   bne(tmp, expected, fail);
2706 
2707   andr(tmp, old, not_mask);
2708   orr(tmp, tmp, new_val);
2709   sc_w(tmp, tmp, aligned_addr, release);
2710   bnez(tmp, retry);
2711 
2712   if (result_as_bool) {
2713     mv(result, 1);
2714     j(done);
2715 
2716     bind(fail);
2717     mv(result, zr);
2718 
2719     bind(done);
2720   } else {
2721     andr(tmp, old, mask);
2722 
2723     bind(fail);
2724     srl(result, tmp, shift);
2725 
2726     if (size == int8) {
2727       sign_extend(result, result, 8);
2728     } else {
2729       // size == int16 case
2730       sign_extend(result, result, 16);
2731     }
2732   }
2733 }
2734 
2735 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
2736 // the weak CAS stuff. The major difference is that it just failed when store conditional
2737 // failed.
2738 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
2739                                                Register new_val,
2740                                                enum operand_size size,
2741                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
2742                                                Register result,
2743                                                Register tmp1, Register tmp2, Register tmp3) {
2744   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2745   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2746   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2747 
2748   Label fail, done;
2749 
2750   lr_w(old, aligned_addr, acquire);
2751   andr(tmp, old, mask);
2752   bne(tmp, expected, fail);
2753 
2754   andr(tmp, old, not_mask);
2755   orr(tmp, tmp, new_val);
2756   sc_w(tmp, tmp, aligned_addr, release);
2757   bnez(tmp, fail);
2758 
2759   // Success
2760   mv(result, 1);
2761   j(done);
2762 
2763   // Fail
2764   bind(fail);
2765   mv(result, zr);
2766 
2767   bind(done);
2768 }
2769 
2770 void MacroAssembler::cmpxchg(Register addr, Register expected,
2771                              Register new_val,
2772                              enum operand_size size,
2773                              Assembler::Aqrl acquire, Assembler::Aqrl release,
2774                              Register result, bool result_as_bool) {
2775   assert(size != int8 && size != int16, "unsupported operand size");
2776   assert_different_registers(addr, t0);
2777   assert_different_registers(expected, t0);
2778   assert_different_registers(new_val, t0);
2779 
2780   Label retry_load, done, ne_done;
2781   bind(retry_load);
2782   load_reserved(addr, size, acquire);
2783   bne(t0, expected, ne_done);
2784   store_conditional(addr, new_val, size, release);
2785   bnez(t0, retry_load);
2786 
2787   // equal, succeed
2788   if (result_as_bool) {
2789     mv(result, 1);
2790   } else {
2791     mv(result, expected);
2792   }
2793   j(done);
2794 
2795   // not equal, failed
2796   bind(ne_done);
2797   if (result_as_bool) {
2798     mv(result, zr);
2799   } else {
2800     mv(result, t0);
2801   }
2802 
2803   bind(done);
2804 }
2805 
2806 void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
2807                                   Register new_val,
2808                                   enum operand_size size,
2809                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
2810                                   Register result) {
2811   assert_different_registers(addr, t0);
2812   assert_different_registers(expected, t0);
2813   assert_different_registers(new_val, t0);
2814 
2815   Label fail, done;
2816   load_reserved(addr, size, acquire);
2817   bne(t0, expected, fail);
2818   store_conditional(addr, new_val, size, release);
2819   bnez(t0, fail);
2820 
2821   // Success
2822   mv(result, 1);
2823   j(done);
2824 
2825   // Fail
2826   bind(fail);
2827   mv(result, zr);
2828 
2829   bind(done);
2830 }
2831 
2832 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
2833 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2834   prev = prev->is_valid() ? prev : zr;                                                      \
2835   if (incr.is_register()) {                                                                 \
2836     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
2837   } else {                                                                                  \
2838     mv(t0, incr.as_constant());                                                             \
2839     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
2840   }                                                                                         \
2841   return;                                                                                   \
2842 }
2843 
2844 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
2845 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
2846 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
2847 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
2848 
2849 #undef ATOMIC_OP
2850 
2851 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
2852 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
2853   prev = prev->is_valid() ? prev : zr;                                               \
2854   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
2855   return;                                                                            \
2856 }
2857 
2858 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
2859 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
2860 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
2861 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
2862 
2863 #undef ATOMIC_XCHG
2864 
2865 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
2866 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
2867   atomic_##OP2(prev, newv, addr);                                                    \
2868   zero_extend(prev, prev, 32);                                                       \
2869   return;                                                                            \
2870 }
2871 
2872 ATOMIC_XCHGU(xchgwu, xchgw)
2873 ATOMIC_XCHGU(xchgalwu, xchgalw)
2874 
2875 #undef ATOMIC_XCHGU
2876 
2877 void MacroAssembler::far_jump(Address entry, Register tmp) {
2878   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
2879   assert(CodeCache::find_blob(entry.target()) != nullptr,
2880          "destination of far call not found in code cache");
2881   assert(entry.rspec().type() == relocInfo::external_word_type
2882         || entry.rspec().type() == relocInfo::runtime_call_type
2883         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
2884   IncompressibleRegion ir(this);  // Fixed length: see MacroAssembler::far_branch_size()
2885   if (far_branches()) {
2886     // We can use auipc + jalr here because we know that the total size of
2887     // the code cache cannot exceed 2Gb.
2888     relocate(entry.rspec(), [&] {
2889       int32_t offset;
2890       la_patchable(tmp, entry, offset);
2891       jalr(x0, tmp, offset);
2892     });
2893   } else {
2894     j(entry);
2895   }
2896 }
2897 
2898 void MacroAssembler::far_call(Address entry, Register tmp) {
2899   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
2900   assert(CodeCache::find_blob(entry.target()) != nullptr,
2901          "destination of far call not found in code cache");
2902   assert(entry.rspec().type() == relocInfo::external_word_type
2903         || entry.rspec().type() == relocInfo::runtime_call_type
2904         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
2905   IncompressibleRegion ir(this);  // Fixed length: see MacroAssembler::far_branch_size()
2906   if (far_branches()) {
2907     // We can use auipc + jalr here because we know that the total size of
2908     // the code cache cannot exceed 2Gb.
2909     relocate(entry.rspec(), [&] {
2910       int32_t offset;
2911       la_patchable(tmp, entry, offset);
2912       jalr(x1, tmp, offset); // link
2913     });
2914   } else {
2915     jal(entry); // link
2916   }
2917 }
2918 
2919 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
2920                                                    Register super_klass,
2921                                                    Register tmp_reg,
2922                                                    Label* L_success,
2923                                                    Label* L_failure,
2924                                                    Label* L_slow_path,
2925                                                    Register super_check_offset) {
2926   assert_different_registers(sub_klass, super_klass, tmp_reg);
2927   bool must_load_sco = (super_check_offset == noreg);
2928   if (must_load_sco) {
2929     assert(tmp_reg != noreg, "supply either a temp or a register offset");
2930   } else {
2931     assert_different_registers(sub_klass, super_klass, super_check_offset);
2932   }
2933 
2934   Label L_fallthrough;
2935   int label_nulls = 0;
2936   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
2937   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
2938   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
2939   assert(label_nulls <= 1, "at most one null in batch");
2940 
2941   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
2942   int sco_offset = in_bytes(Klass::super_check_offset_offset());
2943   Address super_check_offset_addr(super_klass, sco_offset);
2944 
2945   // Hacked jmp, which may only be used just before L_fallthrough.
2946 #define final_jmp(label)                                                \
2947   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
2948   else                            j(label)             /*omit semi*/
2949 
2950   // If the pointers are equal, we are done (e.g., String[] elements).
2951   // This self-check enables sharing of secondary supertype arrays among
2952   // non-primary types such as array-of-interface. Otherwise, each such
2953   // type would need its own customized SSA.
2954   // We move this check to the front of the fast path because many
2955   // type checks are in fact trivially successful in this manner,
2956   // so we get a nicely predicted branch right at the start of the check.
2957   beq(sub_klass, super_klass, *L_success);
2958 
2959   // Check the supertype display:
2960   if (must_load_sco) {
2961     lwu(tmp_reg, super_check_offset_addr);
2962     super_check_offset = tmp_reg;
2963   }
2964   add(t0, sub_klass, super_check_offset);
2965   Address super_check_addr(t0);
2966   ld(t0, super_check_addr); // load displayed supertype
2967 
2968   // This check has worked decisively for primary supers.
2969   // Secondary supers are sought in the super_cache ('super_cache_addr').
2970   // (Secondary supers are interfaces and very deeply nested subtypes.)
2971   // This works in the same check above because of a tricky aliasing
2972   // between the super_Cache and the primary super display elements.
2973   // (The 'super_check_addr' can address either, as the case requires.)
2974   // Note that the cache is updated below if it does not help us find
2975   // what we need immediately.
2976   // So if it was a primary super, we can just fail immediately.
2977   // Otherwise, it's the slow path for us (no success at this point).
2978 
2979   beq(super_klass, t0, *L_success);
2980   mv(t1, sc_offset);
2981   if (L_failure == &L_fallthrough) {
2982     beq(super_check_offset, t1, *L_slow_path);
2983   } else {
2984     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
2985     final_jmp(*L_slow_path);
2986   }
2987 
2988   bind(L_fallthrough);
2989 
2990 #undef final_jmp
2991 }
2992 
2993 // Scans count pointer sized words at [addr] for occurrence of value,
2994 // generic
2995 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
2996                                 Register tmp) {
2997   Label Lloop, Lexit;
2998   beqz(count, Lexit);
2999   bind(Lloop);
3000   ld(tmp, addr);
3001   beq(value, tmp, Lexit);
3002   add(addr, addr, wordSize);
3003   sub(count, count, 1);
3004   bnez(count, Lloop);
3005   bind(Lexit);
3006 }
3007 
3008 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3009                                                    Register super_klass,
3010                                                    Register tmp1_reg,
3011                                                    Register tmp2_reg,
3012                                                    Label* L_success,
3013                                                    Label* L_failure) {
3014   assert_different_registers(sub_klass, super_klass, tmp1_reg);
3015   if (tmp2_reg != noreg) {
3016     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
3017   }
3018 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
3019 
3020   Label L_fallthrough;
3021   int label_nulls = 0;
3022   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3023   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3024 
3025   assert(label_nulls <= 1, "at most one null in the batch");
3026 
3027   // A couple of useful fields in sub_klass:
3028   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3029   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3030   Address secondary_supers_addr(sub_klass, ss_offset);
3031   Address super_cache_addr(     sub_klass, sc_offset);
3032 
3033   BLOCK_COMMENT("check_klass_subtype_slow_path");
3034 
3035   // Do a linear scan of the secondary super-klass chain.
3036   // This code is rarely used, so simplicity is a virtue here.
3037   // The repne_scan instruction uses fixed registers, which we must spill.
3038   // Don't worry too much about pre-existing connections with the input regs.
3039 
3040   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
3041   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
3042 
3043   RegSet pushed_registers;
3044   if (!IS_A_TEMP(x12)) {
3045     pushed_registers += x12;
3046   }
3047   if (!IS_A_TEMP(x15)) {
3048     pushed_registers += x15;
3049   }
3050 
3051   if (super_klass != x10) {
3052     if (!IS_A_TEMP(x10)) {
3053       pushed_registers += x10;
3054     }
3055   }
3056 
3057   push_reg(pushed_registers, sp);
3058 
3059   // Get super_klass value into x10 (even if it was in x15 or x12)
3060   mv(x10, super_klass);
3061 
3062 #ifndef PRODUCT
3063   mv(t1, (address)&SharedRuntime::_partial_subtype_ctr);
3064   Address pst_counter_addr(t1);
3065   ld(t0, pst_counter_addr);
3066   add(t0, t0, 1);
3067   sd(t0, pst_counter_addr);
3068 #endif // PRODUCT
3069 
3070   // We will consult the secondary-super array.
3071   ld(x15, secondary_supers_addr);
3072   // Load the array length.
3073   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
3074   // Skip to start of data.
3075   add(x15, x15, Array<Klass*>::base_offset_in_bytes());
3076 
3077   // Set t0 to an obvious invalid value, falling through by default
3078   mv(t0, -1);
3079   // Scan X12 words at [X15] for an occurrence of X10.
3080   repne_scan(x15, x10, x12, t0);
3081 
3082   // pop will restore x10, so we should use a temp register to keep its value
3083   mv(t1, x10);
3084 
3085   // Unspill the temp registers:
3086   pop_reg(pushed_registers, sp);
3087 
3088   bne(t1, t0, *L_failure);
3089 
3090   // Success. Cache the super we found an proceed in triumph.
3091   sd(super_klass, super_cache_addr);
3092 
3093   if (L_success != &L_fallthrough) {
3094     j(*L_success);
3095   }
3096 
3097 #undef IS_A_TEMP
3098 
3099   bind(L_fallthrough);
3100 }
3101 
3102 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
3103 void MacroAssembler::tlab_allocate(Register obj,
3104                                    Register var_size_in_bytes,
3105                                    int con_size_in_bytes,
3106                                    Register tmp1,
3107                                    Register tmp2,
3108                                    Label& slow_case,
3109                                    bool is_far) {
3110   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3111   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
3112 }
3113 
3114 // get_thread() can be called anywhere inside generated code so we
3115 // need to save whatever non-callee save context might get clobbered
3116 // by the call to Thread::current() or, indeed, the call setup code.
3117 void MacroAssembler::get_thread(Register thread) {
3118   // save all call-clobbered regs except thread
3119   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
3120                       RegSet::range(x28, x31) + ra - thread;
3121   push_reg(saved_regs, sp);
3122 
3123   mv(ra, CAST_FROM_FN_PTR(address, Thread::current));
3124   jalr(ra);
3125   if (thread != c_rarg0) {
3126     mv(thread, c_rarg0);
3127   }
3128 
3129   // restore pushed registers
3130   pop_reg(saved_regs, sp);
3131 }
3132 
3133 void MacroAssembler::load_byte_map_base(Register reg) {
3134   CardTable::CardValue* byte_map_base =
3135     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
3136   mv(reg, (uint64_t)byte_map_base);
3137 }
3138 
3139 void MacroAssembler::la_patchable(Register reg1, const Address &dest, int32_t &offset) {
3140   unsigned long low_address = (uintptr_t)CodeCache::low_bound();
3141   unsigned long high_address = (uintptr_t)CodeCache::high_bound();
3142   unsigned long dest_address = (uintptr_t)dest.target();
3143   long offset_low = dest_address - low_address;
3144   long offset_high = dest_address - high_address;
3145 
3146   assert(dest.getMode() == Address::literal, "la_patchable must be applied to a literal address");
3147   assert((uintptr_t)dest.target() < (1ull << 48), "bad address");
3148 
3149   // RISC-V doesn't compute a page-aligned address, in order to partially
3150   // compensate for the use of *signed* offsets in its base+disp12
3151   // addressing mode (RISC-V's PC-relative reach remains asymmetric
3152   // [-(2G + 2K), 2G - 2K).
3153   if (offset_high >= -((1L << 31) + (1L << 11)) && offset_low < (1L << 31) - (1L << 11)) {
3154     int64_t distance = dest.target() - pc();
3155     auipc(reg1, (int32_t)distance + 0x800);
3156     offset = ((int32_t)distance << 20) >> 20;
3157   } else {
3158     movptr(reg1, dest.target(), offset);
3159   }
3160 }
3161 
3162 void MacroAssembler::build_frame(int framesize) {
3163   assert(framesize >= 2, "framesize must include space for FP/RA");
3164   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3165   sub(sp, sp, framesize);
3166   sd(fp, Address(sp, framesize - 2 * wordSize));
3167   sd(ra, Address(sp, framesize - wordSize));
3168   if (PreserveFramePointer) { add(fp, sp, framesize); }
3169 }
3170 
3171 void MacroAssembler::remove_frame(int framesize) {
3172   assert(framesize >= 2, "framesize must include space for FP/RA");
3173   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3174   ld(fp, Address(sp, framesize - 2 * wordSize));
3175   ld(ra, Address(sp, framesize - wordSize));
3176   add(sp, sp, framesize);
3177 }
3178 
3179 void MacroAssembler::reserved_stack_check() {
3180     // testing if reserved zone needs to be enabled
3181     Label no_reserved_zone_enabling;
3182 
3183     ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
3184     bltu(sp, t0, no_reserved_zone_enabling);
3185 
3186     enter();   // RA and FP are live.
3187     mv(c_rarg0, xthread);
3188     RuntimeAddress target(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
3189     relocate(target.rspec(), [&] {
3190       int32_t offset;
3191       la_patchable(t0, target, offset);
3192       jalr(x1, t0, offset);
3193     });
3194     leave();
3195 
3196     // We have already removed our own frame.
3197     // throw_delayed_StackOverflowError will think that it's been
3198     // called by our caller.
3199     target = RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry());
3200     relocate(target.rspec(), [&] {
3201       int32_t offset;
3202       la_patchable(t0, target, offset);
3203       jalr(x0, t0, offset);
3204     });
3205     should_not_reach_here();
3206 
3207     bind(no_reserved_zone_enabling);
3208 }
3209 
3210 // Move the address of the polling page into dest.
3211 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
3212   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
3213 }
3214 
3215 // Read the polling page.  The address of the polling page must
3216 // already be in r.
3217 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
3218   relocate(rtype, [&] {
3219     lwu(zr, Address(r, offset));
3220   });
3221 }
3222 
3223 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3224 #ifdef ASSERT
3225   {
3226     ThreadInVMfromUnknown tiv;
3227     assert (UseCompressedOops, "should only be used for compressed oops");
3228     assert (Universe::heap() != nullptr, "java heap should be initialized");
3229     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
3230     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3231   }
3232 #endif
3233   int oop_index = oop_recorder()->find_index(obj);
3234   relocate(oop_Relocation::spec(oop_index), [&] {
3235     li32(dst, 0xDEADBEEF);
3236   });
3237   zero_extend(dst, dst, 32);
3238 }
3239 
3240 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3241   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3242   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
3243   int index = oop_recorder()->find_index(k);
3244   assert(!Universe::heap()->is_in(k), "should not be an oop");
3245 
3246   narrowKlass nk = CompressedKlassPointers::encode(k);
3247   relocate(metadata_Relocation::spec(index), [&] {
3248     li32(dst, nk);
3249   });
3250   zero_extend(dst, dst, 32);
3251 }
3252 
3253 // Maybe emit a call via a trampoline. If the code cache is small
3254 // trampolines won't be emitted.
3255 address MacroAssembler::trampoline_call(Address entry) {
3256   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
3257          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
3258          entry.rspec().type() == relocInfo::static_call_type ||
3259          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
3260 
3261   address target = entry.target();
3262 
3263   // We need a trampoline if branches are far.
3264   if (far_branches()) {
3265     if (!in_scratch_emit_size()) {
3266       if (entry.rspec().type() == relocInfo::runtime_call_type) {
3267         assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
3268         code()->share_trampoline_for(entry.target(), offset());
3269       } else {
3270         address stub = emit_trampoline_stub(offset(), target);
3271         if (stub == nullptr) {
3272           postcond(pc() == badAddress);
3273           return nullptr; // CodeCache is full
3274         }
3275       }
3276     }
3277     target = pc();
3278   }
3279 
3280   address call_pc = pc();
3281 #ifdef ASSERT
3282   if (entry.rspec().type() != relocInfo::runtime_call_type) {
3283     assert_alignment(call_pc);
3284   }
3285 #endif
3286   relocate(entry.rspec(), [&] {
3287     jal(target);
3288   });
3289 
3290   postcond(pc() != badAddress);
3291   return call_pc;
3292 }
3293 
3294 address MacroAssembler::ic_call(address entry, jint method_index) {
3295   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
3296   IncompressibleRegion ir(this);  // relocations
3297   movptr(t1, (address)Universe::non_oop_word());
3298   assert_cond(entry != nullptr);
3299   return trampoline_call(Address(entry, rh));
3300 }
3301 
3302 // Emit a trampoline stub for a call to a target which is too far away.
3303 //
3304 // code sequences:
3305 //
3306 // call-site:
3307 //   branch-and-link to <destination> or <trampoline stub>
3308 //
3309 // Related trampoline stub for this call site in the stub section:
3310 //   load the call target from the constant pool
3311 //   branch (RA still points to the call site above)
3312 
3313 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
3314                                              address dest) {
3315   // Max stub size: alignment nop, TrampolineStub.
3316   address stub = start_a_stub(max_trampoline_stub_size());
3317   if (stub == nullptr) {
3318     return nullptr;  // CodeBuffer::expand failed
3319   }
3320 
3321   // We are always 4-byte aligned here.
3322   assert_alignment(pc());
3323 
3324   // Create a trampoline stub relocation which relates this trampoline stub
3325   // with the call instruction at insts_call_instruction_offset in the
3326   // instructions code-section.
3327 
3328   // Make sure the address of destination 8-byte aligned after 3 instructions.
3329   align(wordSize, NativeCallTrampolineStub::data_offset);
3330 
3331   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
3332                                                          insts_call_instruction_offset);
3333   const int stub_start_offset = offset();
3334   relocate(rh, [&] {
3335     // Now, create the trampoline stub's code:
3336     // - load the call
3337     // - call
3338     Label target;
3339     ld(t0, target);  // auipc + ld
3340     jr(t0);          // jalr
3341     bind(target);
3342     assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
3343            "should be");
3344     assert(offset() % wordSize == 0, "bad alignment");
3345     emit_int64((int64_t)dest);
3346   });
3347 
3348   const address stub_start_addr = addr_at(stub_start_offset);
3349 
3350   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
3351 
3352   end_a_stub();
3353   return stub_start_addr;
3354 }
3355 
3356 int MacroAssembler::max_trampoline_stub_size() {
3357   // Max stub size: alignment nop, TrampolineStub.
3358   return NativeInstruction::instruction_size + NativeCallTrampolineStub::instruction_size;
3359 }
3360 
3361 int MacroAssembler::static_call_stub_size() {
3362   // (lui, addi, slli, addi, slli, addi) + (lui, addi, slli, addi, slli) + jalr
3363   return 12 * NativeInstruction::instruction_size;
3364 }
3365 
3366 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
3367   switch (dst.getMode()) {
3368     case Address::base_plus_offset:
3369       // This is the expected mode, although we allow all the other
3370       // forms below.
3371       return form_address(tmp, dst.base(), dst.offset());
3372     default:
3373       la(tmp, dst);
3374       return Address(tmp);
3375   }
3376 }
3377 
3378 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3379   assert(((dst.getMode() == Address::base_plus_offset &&
3380            is_simm12(dst.offset())) || is_simm12(value)),
3381           "invalid value and address mode combination");
3382   Address adr = add_memory_helper(dst, tmp2);
3383   assert(!adr.uses(tmp1), "invalid dst for address increment");
3384   ld(tmp1, adr);
3385   add(tmp1, tmp1, value, tmp2);
3386   sd(tmp1, adr);
3387 }
3388 
3389 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3390   assert(((dst.getMode() == Address::base_plus_offset &&
3391            is_simm12(dst.offset())) || is_simm12(value)),
3392           "invalid value and address mode combination");
3393   Address adr = add_memory_helper(dst, tmp2);
3394   assert(!adr.uses(tmp1), "invalid dst for address increment");
3395   lwu(tmp1, adr);
3396   addw(tmp1, tmp1, value, tmp2);
3397   sw(tmp1, adr);
3398 }
3399 
3400 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3401   assert(((dst.getMode() == Address::base_plus_offset &&
3402            is_simm12(dst.offset())) || is_simm12(value)),
3403           "invalid value and address mode combination");
3404   Address adr = add_memory_helper(dst, tmp2);
3405   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3406   ld(tmp1, adr);
3407   sub(tmp1, tmp1, value, tmp2);
3408   sd(tmp1, adr);
3409 }
3410 
3411 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3412   assert(((dst.getMode() == Address::base_plus_offset &&
3413            is_simm12(dst.offset())) || is_simm12(value)),
3414           "invalid value and address mode combination");
3415   Address adr = add_memory_helper(dst, tmp2);
3416   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3417   lwu(tmp1, adr);
3418   subw(tmp1, tmp1, value, tmp2);
3419   sw(tmp1, adr);
3420 }
3421 
3422 void MacroAssembler::cmpptr(Register src1, Address src2, Label& equal) {
3423   assert_different_registers(src1, t0);
3424   relocate(src2.rspec(), [&] {
3425     int32_t offset;
3426     la_patchable(t0, src2, offset);
3427     ld(t0, Address(t0, offset));
3428   });
3429   beq(src1, t0, equal);
3430 }
3431 
3432 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
3433   load_method_holder(result, method);
3434   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
3435 }
3436 
3437 void MacroAssembler::load_method_holder(Register holder, Register method) {
3438   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3439   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3440   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
3441 }
3442 
3443 // string indexof
3444 // compute index by trailing zeros
3445 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
3446                                    Register match_mask, Register result,
3447                                    Register ch2, Register tmp,
3448                                    bool haystack_isL) {
3449   int haystack_chr_shift = haystack_isL ? 0 : 1;
3450   srl(match_mask, match_mask, trailing_zeros);
3451   srli(match_mask, match_mask, 1);
3452   srli(tmp, trailing_zeros, LogBitsPerByte);
3453   if (!haystack_isL) andi(tmp, tmp, 0xE);
3454   add(haystack, haystack, tmp);
3455   ld(ch2, Address(haystack));
3456   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
3457   add(result, result, tmp);
3458 }
3459 
3460 // string indexof
3461 // Find pattern element in src, compute match mask,
3462 // only the first occurrence of 0x80/0x8000 at low bits is the valid match index
3463 // match mask patterns and corresponding indices would be like:
3464 // - 0x8080808080808080 (Latin1)
3465 // -   7 6 5 4 3 2 1 0  (match index)
3466 // - 0x8000800080008000 (UTF16)
3467 // -   3   2   1   0    (match index)
3468 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
3469                                         Register mask1, Register mask2) {
3470   xorr(src, pattern, src);
3471   sub(match_mask, src, mask1);
3472   orr(src, src, mask2);
3473   notr(src, src);
3474   andr(match_mask, match_mask, src);
3475 }
3476 
3477 #ifdef COMPILER2
3478 // Code for BigInteger::mulAdd intrinsic
3479 // out     = x10
3480 // in      = x11
3481 // offset  = x12  (already out.length-offset)
3482 // len     = x13
3483 // k       = x14
3484 // tmp     = x28
3485 //
3486 // pseudo code from java implementation:
3487 // long kLong = k & LONG_MASK;
3488 // carry = 0;
3489 // offset = out.length-offset - 1;
3490 // for (int j = len - 1; j >= 0; j--) {
3491 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3492 //     out[offset--] = (int)product;
3493 //     carry = product >>> 32;
3494 // }
3495 // return (int)carry;
3496 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3497                              Register len, Register k, Register tmp) {
3498   Label L_tail_loop, L_unroll, L_end;
3499   mv(tmp, out);
3500   mv(out, zr);
3501   blez(len, L_end);
3502   zero_extend(k, k, 32);
3503   slliw(t0, offset, LogBytesPerInt);
3504   add(offset, tmp, t0);
3505   slliw(t0, len, LogBytesPerInt);
3506   add(in, in, t0);
3507 
3508   const int unroll = 8;
3509   mv(tmp, unroll);
3510   blt(len, tmp, L_tail_loop);
3511   bind(L_unroll);
3512   for (int i = 0; i < unroll; i++) {
3513     sub(in, in, BytesPerInt);
3514     lwu(t0, Address(in, 0));
3515     mul(t1, t0, k);
3516     add(t0, t1, out);
3517     sub(offset, offset, BytesPerInt);
3518     lwu(t1, Address(offset, 0));
3519     add(t0, t0, t1);
3520     sw(t0, Address(offset, 0));
3521     srli(out, t0, 32);
3522   }
3523   subw(len, len, tmp);
3524   bge(len, tmp, L_unroll);
3525 
3526   bind(L_tail_loop);
3527   blez(len, L_end);
3528   sub(in, in, BytesPerInt);
3529   lwu(t0, Address(in, 0));
3530   mul(t1, t0, k);
3531   add(t0, t1, out);
3532   sub(offset, offset, BytesPerInt);
3533   lwu(t1, Address(offset, 0));
3534   add(t0, t0, t1);
3535   sw(t0, Address(offset, 0));
3536   srli(out, t0, 32);
3537   subw(len, len, 1);
3538   j(L_tail_loop);
3539 
3540   bind(L_end);
3541 }
3542 
3543 // add two unsigned input and output carry
3544 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
3545 {
3546   assert_different_registers(dst, carry);
3547   assert_different_registers(dst, src2);
3548   add(dst, src1, src2);
3549   sltu(carry, dst, src2);
3550 }
3551 
3552 // add two input with carry
3553 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
3554   assert_different_registers(dst, carry);
3555   add(dst, src1, src2);
3556   add(dst, dst, carry);
3557 }
3558 
3559 // add two unsigned input with carry and output carry
3560 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
3561   assert_different_registers(dst, src2);
3562   adc(dst, src1, src2, carry);
3563   sltu(carry, dst, src2);
3564 }
3565 
3566 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
3567                                      Register src1, Register src2, Register carry) {
3568   cad(dest_lo, dest_lo, src1, carry);
3569   add(dest_hi, dest_hi, carry);
3570   cad(dest_lo, dest_lo, src2, carry);
3571   add(final_dest_hi, dest_hi, carry);
3572 }
3573 
3574 /**
3575  * Multiply 32 bit by 32 bit first loop.
3576  */
3577 void MacroAssembler::multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
3578                                            Register y, Register y_idx, Register z,
3579                                            Register carry, Register product,
3580                                            Register idx, Register kdx) {
3581   // jlong carry, x[], y[], z[];
3582   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3583   //     long product = y[idx] * x[xstart] + carry;
3584   //     z[kdx] = (int)product;
3585   //     carry = product >>> 32;
3586   // }
3587   // z[xstart] = (int)carry;
3588 
3589   Label L_first_loop, L_first_loop_exit;
3590   blez(idx, L_first_loop_exit);
3591 
3592   shadd(t0, xstart, x, t0, LogBytesPerInt);
3593   lwu(x_xstart, Address(t0, 0));
3594 
3595   bind(L_first_loop);
3596   subw(idx, idx, 1);
3597   shadd(t0, idx, y, t0, LogBytesPerInt);
3598   lwu(y_idx, Address(t0, 0));
3599   mul(product, x_xstart, y_idx);
3600   add(product, product, carry);
3601   srli(carry, product, 32);
3602   subw(kdx, kdx, 1);
3603   shadd(t0, kdx, z, t0, LogBytesPerInt);
3604   sw(product, Address(t0, 0));
3605   bgtz(idx, L_first_loop);
3606 
3607   bind(L_first_loop_exit);
3608 }
3609 
3610 /**
3611  * Multiply 64 bit by 64 bit first loop.
3612  */
3613 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
3614                                            Register y, Register y_idx, Register z,
3615                                            Register carry, Register product,
3616                                            Register idx, Register kdx) {
3617   //
3618   //  jlong carry, x[], y[], z[];
3619   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3620   //    huge_128 product = y[idx] * x[xstart] + carry;
3621   //    z[kdx] = (jlong)product;
3622   //    carry  = (jlong)(product >>> 64);
3623   //  }
3624   //  z[xstart] = carry;
3625   //
3626 
3627   Label L_first_loop, L_first_loop_exit;
3628   Label L_one_x, L_one_y, L_multiply;
3629 
3630   subw(xstart, xstart, 1);
3631   bltz(xstart, L_one_x);
3632 
3633   shadd(t0, xstart, x, t0, LogBytesPerInt);
3634   ld(x_xstart, Address(t0, 0));
3635   ror_imm(x_xstart, x_xstart, 32); // convert big-endian to little-endian
3636 
3637   bind(L_first_loop);
3638   subw(idx, idx, 1);
3639   bltz(idx, L_first_loop_exit);
3640   subw(idx, idx, 1);
3641   bltz(idx, L_one_y);
3642 
3643   shadd(t0, idx, y, t0, LogBytesPerInt);
3644   ld(y_idx, Address(t0, 0));
3645   ror_imm(y_idx, y_idx, 32); // convert big-endian to little-endian
3646   bind(L_multiply);
3647 
3648   mulhu(t0, x_xstart, y_idx);
3649   mul(product, x_xstart, y_idx);
3650   cad(product, product, carry, t1);
3651   adc(carry, t0, zr, t1);
3652 
3653   subw(kdx, kdx, 2);
3654   ror_imm(product, product, 32); // back to big-endian
3655   shadd(t0, kdx, z, t0, LogBytesPerInt);
3656   sd(product, Address(t0, 0));
3657 
3658   j(L_first_loop);
3659 
3660   bind(L_one_y);
3661   lwu(y_idx, Address(y, 0));
3662   j(L_multiply);
3663 
3664   bind(L_one_x);
3665   lwu(x_xstart, Address(x, 0));
3666   j(L_first_loop);
3667 
3668   bind(L_first_loop_exit);
3669 }
3670 
3671 /**
3672  * Multiply 128 bit by 128 bit. Unrolled inner loop.
3673  *
3674  */
3675 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3676                                              Register carry, Register carry2,
3677                                              Register idx, Register jdx,
3678                                              Register yz_idx1, Register yz_idx2,
3679                                              Register tmp, Register tmp3, Register tmp4,
3680                                              Register tmp6, Register product_hi) {
3681   //   jlong carry, x[], y[], z[];
3682   //   int kdx = xstart+1;
3683   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3684   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3685   //     jlong carry2  = (jlong)(tmp3 >>> 64);
3686   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
3687   //     carry  = (jlong)(tmp4 >>> 64);
3688   //     z[kdx+idx+1] = (jlong)tmp3;
3689   //     z[kdx+idx] = (jlong)tmp4;
3690   //   }
3691   //   idx += 2;
3692   //   if (idx > 0) {
3693   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3694   //     z[kdx+idx] = (jlong)yz_idx1;
3695   //     carry  = (jlong)(yz_idx1 >>> 64);
3696   //   }
3697   //
3698 
3699   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3700 
3701   srliw(jdx, idx, 2);
3702 
3703   bind(L_third_loop);
3704 
3705   subw(jdx, jdx, 1);
3706   bltz(jdx, L_third_loop_exit);
3707   subw(idx, idx, 4);
3708 
3709   shadd(t0, idx, y, t0, LogBytesPerInt);
3710   ld(yz_idx2, Address(t0, 0));
3711   ld(yz_idx1, Address(t0, wordSize));
3712 
3713   shadd(tmp6, idx, z, t0, LogBytesPerInt);
3714 
3715   ror_imm(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
3716   ror_imm(yz_idx2, yz_idx2, 32);
3717 
3718   ld(t1, Address(tmp6, 0));
3719   ld(t0, Address(tmp6, wordSize));
3720 
3721   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
3722   mulhu(tmp4, product_hi, yz_idx1);
3723 
3724   ror_imm(t0, t0, 32, tmp); // convert big-endian to little-endian
3725   ror_imm(t1, t1, 32, tmp);
3726 
3727   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
3728   mulhu(carry2, product_hi, yz_idx2);
3729 
3730   cad(tmp3, tmp3, carry, carry);
3731   adc(tmp4, tmp4, zr, carry);
3732   cad(tmp3, tmp3, t0, t0);
3733   cadc(tmp4, tmp4, tmp, t0);
3734   adc(carry, carry2, zr, t0);
3735   cad(tmp4, tmp4, t1, carry2);
3736   adc(carry, carry, zr, carry2);
3737 
3738   ror_imm(tmp3, tmp3, 32); // convert little-endian to big-endian
3739   ror_imm(tmp4, tmp4, 32);
3740   sd(tmp4, Address(tmp6, 0));
3741   sd(tmp3, Address(tmp6, wordSize));
3742 
3743   j(L_third_loop);
3744 
3745   bind(L_third_loop_exit);
3746 
3747   andi(idx, idx, 0x3);
3748   beqz(idx, L_post_third_loop_done);
3749 
3750   Label L_check_1;
3751   subw(idx, idx, 2);
3752   bltz(idx, L_check_1);
3753 
3754   shadd(t0, idx, y, t0, LogBytesPerInt);
3755   ld(yz_idx1, Address(t0, 0));
3756   ror_imm(yz_idx1, yz_idx1, 32);
3757 
3758   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
3759   mulhu(tmp4, product_hi, yz_idx1);
3760 
3761   shadd(t0, idx, z, t0, LogBytesPerInt);
3762   ld(yz_idx2, Address(t0, 0));
3763   ror_imm(yz_idx2, yz_idx2, 32, tmp);
3764 
3765   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
3766 
3767   ror_imm(tmp3, tmp3, 32, tmp);
3768   sd(tmp3, Address(t0, 0));
3769 
3770   bind(L_check_1);
3771 
3772   andi(idx, idx, 0x1);
3773   subw(idx, idx, 1);
3774   bltz(idx, L_post_third_loop_done);
3775   shadd(t0, idx, y, t0, LogBytesPerInt);
3776   lwu(tmp4, Address(t0, 0));
3777   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
3778   mulhu(carry2, tmp4, product_hi);
3779 
3780   shadd(t0, idx, z, t0, LogBytesPerInt);
3781   lwu(tmp4, Address(t0, 0));
3782 
3783   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
3784 
3785   shadd(t0, idx, z, t0, LogBytesPerInt);
3786   sw(tmp3, Address(t0, 0));
3787 
3788   slli(t0, carry2, 32);
3789   srli(carry, tmp3, 32);
3790   orr(carry, carry, t0);
3791 
3792   bind(L_post_third_loop_done);
3793 }
3794 
3795 /**
3796  * Code for BigInteger::multiplyToLen() intrinsic.
3797  *
3798  * x10: x
3799  * x11: xlen
3800  * x12: y
3801  * x13: ylen
3802  * x14: z
3803  * x15: zlen
3804  * x16: tmp1
3805  * x17: tmp2
3806  * x7:  tmp3
3807  * x28: tmp4
3808  * x29: tmp5
3809  * x30: tmp6
3810  * x31: tmp7
3811  */
3812 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3813                                      Register z, Register zlen,
3814                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3815                                      Register tmp5, Register tmp6, Register product_hi) {
3816   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3817 
3818   const Register idx = tmp1;
3819   const Register kdx = tmp2;
3820   const Register xstart = tmp3;
3821 
3822   const Register y_idx = tmp4;
3823   const Register carry = tmp5;
3824   const Register product = xlen;
3825   const Register x_xstart = zlen; // reuse register
3826 
3827   mv(idx, ylen); // idx = ylen;
3828   mv(kdx, zlen); // kdx = xlen+ylen;
3829   mv(carry, zr); // carry = 0;
3830 
3831   Label L_multiply_64_x_64_loop, L_done;
3832 
3833   subw(xstart, xlen, 1);
3834   bltz(xstart, L_done);
3835 
3836   const Register jdx = tmp1;
3837 
3838   if (AvoidUnalignedAccesses) {
3839     // Check if x and y are both 8-byte aligned.
3840     orr(t0, xlen, ylen);
3841     test_bit(t0, t0, 0);
3842     beqz(t0, L_multiply_64_x_64_loop);
3843 
3844     multiply_32_x_32_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3845     shadd(t0, xstart, z, t0, LogBytesPerInt);
3846     sw(carry, Address(t0, 0));
3847 
3848     Label L_second_loop_unaligned;
3849     bind(L_second_loop_unaligned);
3850     mv(carry, zr);
3851     mv(jdx, ylen);
3852     subw(xstart, xstart, 1);
3853     bltz(xstart, L_done);
3854     sub(sp, sp, 2 * wordSize);
3855     sd(z, Address(sp, 0));
3856     sd(zr, Address(sp, wordSize));
3857     shadd(t0, xstart, z, t0, LogBytesPerInt);
3858     addi(z, t0, 4);
3859     shadd(t0, xstart, x, t0, LogBytesPerInt);
3860     lwu(product, Address(t0, 0));
3861     Label L_third_loop, L_third_loop_exit;
3862 
3863     blez(jdx, L_third_loop_exit);
3864 
3865     bind(L_third_loop);
3866     subw(jdx, jdx, 1);
3867     shadd(t0, jdx, y, t0, LogBytesPerInt);
3868     lwu(t0, Address(t0, 0));
3869     mul(t1, t0, product);
3870     add(t0, t1, carry);
3871     shadd(tmp6, jdx, z, t1, LogBytesPerInt);
3872     lwu(t1, Address(tmp6, 0));
3873     add(t0, t0, t1);
3874     sw(t0, Address(tmp6, 0));
3875     srli(carry, t0, 32);
3876     bgtz(jdx, L_third_loop);
3877 
3878     bind(L_third_loop_exit);
3879     ld(z, Address(sp, 0));
3880     addi(sp, sp, 2 * wordSize);
3881     shadd(t0, xstart, z, t0, LogBytesPerInt);
3882     sw(carry, Address(t0, 0));
3883 
3884     j(L_second_loop_unaligned);
3885   }
3886 
3887   bind(L_multiply_64_x_64_loop);
3888   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3889 
3890   Label L_second_loop_aligned;
3891   beqz(kdx, L_second_loop_aligned);
3892 
3893   Label L_carry;
3894   subw(kdx, kdx, 1);
3895   beqz(kdx, L_carry);
3896 
3897   shadd(t0, kdx, z, t0, LogBytesPerInt);
3898   sw(carry, Address(t0, 0));
3899   srli(carry, carry, 32);
3900   subw(kdx, kdx, 1);
3901 
3902   bind(L_carry);
3903   shadd(t0, kdx, z, t0, LogBytesPerInt);
3904   sw(carry, Address(t0, 0));
3905 
3906   // Second and third (nested) loops.
3907   //
3908   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3909   //   carry = 0;
3910   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3911   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3912   //                    (z[k] & LONG_MASK) + carry;
3913   //     z[k] = (int)product;
3914   //     carry = product >>> 32;
3915   //   }
3916   //   z[i] = (int)carry;
3917   // }
3918   //
3919   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3920 
3921   bind(L_second_loop_aligned);
3922   mv(carry, zr); // carry = 0;
3923   mv(jdx, ylen); // j = ystart+1
3924 
3925   subw(xstart, xstart, 1); // i = xstart-1;
3926   bltz(xstart, L_done);
3927 
3928   sub(sp, sp, 4 * wordSize);
3929   sd(z, Address(sp, 0));
3930 
3931   Label L_last_x;
3932   shadd(t0, xstart, z, t0, LogBytesPerInt);
3933   addi(z, t0, 4);
3934   subw(xstart, xstart, 1); // i = xstart-1;
3935   bltz(xstart, L_last_x);
3936 
3937   shadd(t0, xstart, x, t0, LogBytesPerInt);
3938   ld(product_hi, Address(t0, 0));
3939   ror_imm(product_hi, product_hi, 32); // convert big-endian to little-endian
3940 
3941   Label L_third_loop_prologue;
3942   bind(L_third_loop_prologue);
3943 
3944   sd(ylen, Address(sp, wordSize));
3945   sd(x, Address(sp, 2 * wordSize));
3946   sd(xstart, Address(sp, 3 * wordSize));
3947   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3948                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3949   ld(z, Address(sp, 0));
3950   ld(ylen, Address(sp, wordSize));
3951   ld(x, Address(sp, 2 * wordSize));
3952   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
3953   addi(sp, sp, 4 * wordSize);
3954 
3955   addiw(tmp3, xlen, 1);
3956   shadd(t0, tmp3, z, t0, LogBytesPerInt);
3957   sw(carry, Address(t0, 0));
3958 
3959   subw(tmp3, tmp3, 1);
3960   bltz(tmp3, L_done);
3961 
3962   srli(carry, carry, 32);
3963   shadd(t0, tmp3, z, t0, LogBytesPerInt);
3964   sw(carry, Address(t0, 0));
3965   j(L_second_loop_aligned);
3966 
3967   // Next infrequent code is moved outside loops.
3968   bind(L_last_x);
3969   lwu(product_hi, Address(x, 0));
3970   j(L_third_loop_prologue);
3971 
3972   bind(L_done);
3973 }
3974 #endif
3975 
3976 // Count bits of trailing zero chars from lsb to msb until first non-zero element.
3977 // For LL case, one byte for one element, so shift 8 bits once, and for other case,
3978 // shift 16 bits once.
3979 void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1, Register tmp2) {
3980   if (UseZbb) {
3981     assert_different_registers(Rd, Rs, tmp1);
3982     int step = isLL ? 8 : 16;
3983     ctz(Rd, Rs);
3984     andi(tmp1, Rd, step - 1);
3985     sub(Rd, Rd, tmp1);
3986     return;
3987   }
3988 
3989   assert_different_registers(Rd, Rs, tmp1, tmp2);
3990   Label Loop;
3991   int step = isLL ? 8 : 16;
3992   mv(Rd, -step);
3993   mv(tmp2, Rs);
3994 
3995   bind(Loop);
3996   addi(Rd, Rd, step);
3997   andi(tmp1, tmp2, ((1 << step) - 1));
3998   srli(tmp2, tmp2, step);
3999   beqz(tmp1, Loop);
4000 }
4001 
4002 // This instruction reads adjacent 4 bytes from the lower half of source register,
4003 // inflate into a register, for example:
4004 // Rs: A7A6A5A4A3A2A1A0
4005 // Rd: 00A300A200A100A0
4006 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
4007   assert_different_registers(Rd, Rs, tmp1, tmp2);
4008 
4009   mv(tmp1, 0xFF000000); // first byte mask at lower word
4010   andr(Rd, Rs, tmp1);
4011   for (int i = 0; i < 2; i++) {
4012     slli(Rd, Rd, wordSize);
4013     srli(tmp1, tmp1, wordSize);
4014     andr(tmp2, Rs, tmp1);
4015     orr(Rd, Rd, tmp2);
4016   }
4017   slli(Rd, Rd, wordSize);
4018   andi(tmp2, Rs, 0xFF); // last byte mask at lower word
4019   orr(Rd, Rd, tmp2);
4020 }
4021 
4022 // This instruction reads adjacent 4 bytes from the upper half of source register,
4023 // inflate into a register, for example:
4024 // Rs: A7A6A5A4A3A2A1A0
4025 // Rd: 00A700A600A500A4
4026 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
4027   assert_different_registers(Rd, Rs, tmp1, tmp2);
4028   srli(Rs, Rs, 32);   // only upper 32 bits are needed
4029   inflate_lo32(Rd, Rs, tmp1, tmp2);
4030 }
4031 
4032 // The size of the blocks erased by the zero_blocks stub.  We must
4033 // handle anything smaller than this ourselves in zero_words().
4034 const int MacroAssembler::zero_words_block_size = 8;
4035 
4036 // zero_words() is used by C2 ClearArray patterns.  It is as small as
4037 // possible, handling small word counts locally and delegating
4038 // anything larger to the zero_blocks stub.  It is expanded many times
4039 // in compiled code, so it is important to keep it short.
4040 
4041 // ptr:   Address of a buffer to be zeroed.
4042 // cnt:   Count in HeapWords.
4043 //
4044 // ptr, cnt, and t0 are clobbered.
4045 address MacroAssembler::zero_words(Register ptr, Register cnt) {
4046   assert(is_power_of_2(zero_words_block_size), "adjust this");
4047   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
4048   assert_different_registers(cnt, t0);
4049 
4050   BLOCK_COMMENT("zero_words {");
4051 
4052   mv(t0, zero_words_block_size);
4053   Label around, done, done16;
4054   bltu(cnt, t0, around);
4055   {
4056     RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::riscv::zero_blocks());
4057     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
4058     if (StubRoutines::riscv::complete()) {
4059       address tpc = trampoline_call(zero_blocks);
4060       if (tpc == nullptr) {
4061         DEBUG_ONLY(reset_labels(around));
4062         postcond(pc() == badAddress);
4063         return nullptr;
4064       }
4065     } else {
4066       jal(zero_blocks);
4067     }
4068   }
4069   bind(around);
4070   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
4071     Label l;
4072     test_bit(t0, cnt, exact_log2(i));
4073     beqz(t0, l);
4074     for (int j = 0; j < i; j++) {
4075       sd(zr, Address(ptr, j * wordSize));
4076     }
4077     addi(ptr, ptr, i * wordSize);
4078     bind(l);
4079   }
4080   {
4081     Label l;
4082     test_bit(t0, cnt, 0);
4083     beqz(t0, l);
4084     sd(zr, Address(ptr, 0));
4085     bind(l);
4086   }
4087 
4088   BLOCK_COMMENT("} zero_words");
4089   postcond(pc() != badAddress);
4090   return pc();
4091 }
4092 
4093 #define SmallArraySize (18 * BytesPerLong)
4094 
4095 // base:  Address of a buffer to be zeroed, 8 bytes aligned.
4096 // cnt:   Immediate count in HeapWords.
4097 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
4098   assert_different_registers(base, t0, t1);
4099 
4100   BLOCK_COMMENT("zero_words {");
4101 
4102   if (cnt <= SmallArraySize / BytesPerLong) {
4103     for (int i = 0; i < (int)cnt; i++) {
4104       sd(zr, Address(base, i * wordSize));
4105     }
4106   } else {
4107     const int unroll = 8; // Number of sd(zr, adr), instructions we'll unroll
4108     int remainder = cnt % unroll;
4109     for (int i = 0; i < remainder; i++) {
4110       sd(zr, Address(base, i * wordSize));
4111     }
4112 
4113     Label loop;
4114     Register cnt_reg = t0;
4115     Register loop_base = t1;
4116     cnt = cnt - remainder;
4117     mv(cnt_reg, cnt);
4118     add(loop_base, base, remainder * wordSize);
4119     bind(loop);
4120     sub(cnt_reg, cnt_reg, unroll);
4121     for (int i = 0; i < unroll; i++) {
4122       sd(zr, Address(loop_base, i * wordSize));
4123     }
4124     add(loop_base, loop_base, unroll * wordSize);
4125     bnez(cnt_reg, loop);
4126   }
4127 
4128   BLOCK_COMMENT("} zero_words");
4129 }
4130 
4131 // base:   Address of a buffer to be filled, 8 bytes aligned.
4132 // cnt:    Count in 8-byte unit.
4133 // value:  Value to be filled with.
4134 // base will point to the end of the buffer after filling.
4135 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
4136 //  Algorithm:
4137 //
4138 //    t0 = cnt & 7
4139 //    cnt -= t0
4140 //    p += t0
4141 //    switch (t0):
4142 //      switch start:
4143 //      do while cnt
4144 //        cnt -= 8
4145 //          p[-8] = value
4146 //        case 7:
4147 //          p[-7] = value
4148 //        case 6:
4149 //          p[-6] = value
4150 //          // ...
4151 //        case 1:
4152 //          p[-1] = value
4153 //        case 0:
4154 //          p += 8
4155 //      do-while end
4156 //    switch end
4157 
4158   assert_different_registers(base, cnt, value, t0, t1);
4159 
4160   Label fini, skip, entry, loop;
4161   const int unroll = 8; // Number of sd instructions we'll unroll
4162 
4163   beqz(cnt, fini);
4164 
4165   andi(t0, cnt, unroll - 1);
4166   sub(cnt, cnt, t0);
4167   // align 8, so first sd n % 8 = mod, next loop sd 8 * n.
4168   shadd(base, t0, base, t1, 3);
4169   la(t1, entry);
4170   slli(t0, t0, 2); // sd_inst_nums * 4; t0 is cnt % 8, so t1 = t1 - sd_inst_nums * 4, 4 is sizeof(inst)
4171   sub(t1, t1, t0);
4172   jr(t1);
4173 
4174   bind(loop);
4175   add(base, base, unroll * 8);
4176   for (int i = -unroll; i < 0; i++) {
4177     sd(value, Address(base, i * 8));
4178   }
4179   bind(entry);
4180   sub(cnt, cnt, unroll);
4181   bgez(cnt, loop);
4182 
4183   bind(fini);
4184 }
4185 
4186 // Zero blocks of memory by using CBO.ZERO.
4187 //
4188 // Aligns the base address first sufficiently for CBO.ZERO, then uses
4189 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
4190 // zeroed in HeapWords.  Returns the count of words left to be zeroed
4191 // in cnt.
4192 //
4193 // NOTE: This is intended to be used in the zero_blocks() stub.  If
4194 // you want to use it elsewhere, note that cnt must be >= CacheLineSize.
4195 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
4196   Label initial_table_end, loop;
4197 
4198   // Align base with cache line size.
4199   neg(tmp1, base);
4200   andi(tmp1, tmp1, CacheLineSize - 1);
4201 
4202   // tmp1: the number of bytes to be filled to align the base with cache line size.
4203   add(base, base, tmp1);
4204   srai(tmp2, tmp1, 3);
4205   sub(cnt, cnt, tmp2);
4206   srli(tmp2, tmp1, 1);
4207   la(tmp1, initial_table_end);
4208   sub(tmp2, tmp1, tmp2);
4209   jr(tmp2);
4210   for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
4211     sd(zr, Address(base, i));
4212   }
4213   bind(initial_table_end);
4214 
4215   mv(tmp1, CacheLineSize / wordSize);
4216   bind(loop);
4217   cbo_zero(base);
4218   sub(cnt, cnt, tmp1);
4219   add(base, base, CacheLineSize);
4220   bge(cnt, tmp1, loop);
4221 }
4222 
4223 // java.lang.Math.round(float a)
4224 // Returns the closest int to the argument, with ties rounding to positive infinity.
4225 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
4226   // this instructions calling sequence provides performance improvement on all tested devices;
4227   // don't change it without re-verification
4228   Label done;
4229   mv(t0, jint_cast(0.5f));
4230   fmv_w_x(ftmp, t0);
4231 
4232   // dst = 0 if NaN
4233   feq_s(t0, src, src); // replacing fclass with feq as performance optimization
4234   mv(dst, zr);
4235   beqz(t0, done);
4236 
4237   // dst = (src + 0.5f) rounded down towards negative infinity
4238   //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
4239   //   RDN is required for fadd_s, RNE gives incorrect results:
4240   //     --------------------------------------------------------------------
4241   //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
4242   //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
4243   //     --------------------------------------------------------------------
4244   //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
4245   //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
4246   //     --------------------------------------------------------------------
4247   fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
4248   fcvt_w_s(dst, ftmp, RoundingMode::rdn);
4249 
4250   bind(done);
4251 }
4252 
4253 // java.lang.Math.round(double a)
4254 // Returns the closest long to the argument, with ties rounding to positive infinity.
4255 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
4256   // this instructions calling sequence provides performance improvement on all tested devices;
4257   // don't change it without re-verification
4258   Label done;
4259   mv(t0, julong_cast(0.5));
4260   fmv_d_x(ftmp, t0);
4261 
4262   // dst = 0 if NaN
4263   feq_d(t0, src, src); // replacing fclass with feq as performance optimization
4264   mv(dst, zr);
4265   beqz(t0, done);
4266 
4267   // dst = (src + 0.5) rounded down towards negative infinity
4268   fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
4269   fcvt_l_d(dst, ftmp, RoundingMode::rdn);
4270 
4271   bind(done);
4272 }
4273 
4274 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
4275 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
4276   Label done;                                                                             \
4277   assert_different_registers(dst, tmp);                                                   \
4278   fclass_##FLOATSIG(tmp, src);                                                            \
4279   mv(dst, zr);                                                                            \
4280   /* check if src is NaN */                                                               \
4281   andi(tmp, tmp, 0b1100000000);                                                           \
4282   bnez(tmp, done);                                                                        \
4283   FLOATCVT(dst, src);                                                                     \
4284   bind(done);                                                                             \
4285 }
4286 
4287 FCVT_SAFE(fcvt_w_s, s);
4288 FCVT_SAFE(fcvt_l_s, s);
4289 FCVT_SAFE(fcvt_w_d, d);
4290 FCVT_SAFE(fcvt_l_d, d);
4291 
4292 #undef FCVT_SAFE
4293 
4294 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
4295 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
4296                                          FloatRegister Rs2, int unordered_result) {     \
4297   Label Ldone;                                                                          \
4298   if (unordered_result < 0) {                                                           \
4299     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
4300     /* installs 1 if gt else 0 */                                                       \
4301     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
4302     /* Rs1 > Rs2, install 1 */                                                          \
4303     bgtz(result, Ldone);                                                                \
4304     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4305     addi(result, result, -1);                                                           \
4306     /* Rs1 = Rs2, install 0 */                                                          \
4307     /* NaN or Rs1 < Rs2, install -1 */                                                  \
4308     bind(Ldone);                                                                        \
4309   } else {                                                                              \
4310     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
4311     /* installs 1 if gt or unordered else 0 */                                          \
4312     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
4313     /* Rs1 < Rs2, install -1 */                                                         \
4314     bgtz(result, Ldone);                                                                \
4315     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4316     addi(result, result, -1);                                                           \
4317     /* Rs1 = Rs2, install 0 */                                                          \
4318     /* NaN or Rs1 > Rs2, install 1 */                                                   \
4319     bind(Ldone);                                                                        \
4320     neg(result, result);                                                                \
4321   }                                                                                     \
4322 }
4323 
4324 FCMP(float, s);
4325 FCMP(double, d);
4326 
4327 #undef FCMP
4328 
4329 // Zero words; len is in bytes
4330 // Destroys all registers except addr
4331 // len must be a nonzero multiple of wordSize
4332 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
4333   assert_different_registers(addr, len, tmp, t0, t1);
4334 
4335 #ifdef ASSERT
4336   {
4337     Label L;
4338     andi(t0, len, BytesPerWord - 1);
4339     beqz(t0, L);
4340     stop("len is not a multiple of BytesPerWord");
4341     bind(L);
4342   }
4343 #endif // ASSERT
4344 
4345 #ifndef PRODUCT
4346   block_comment("zero memory");
4347 #endif // PRODUCT
4348 
4349   Label loop;
4350   Label entry;
4351 
4352   // Algorithm:
4353   //
4354   //  t0 = cnt & 7
4355   //  cnt -= t0
4356   //  p += t0
4357   //  switch (t0) {
4358   //    do {
4359   //      cnt -= 8
4360   //        p[-8] = 0
4361   //      case 7:
4362   //        p[-7] = 0
4363   //      case 6:
4364   //        p[-6] = 0
4365   //        ...
4366   //      case 1:
4367   //        p[-1] = 0
4368   //      case 0:
4369   //        p += 8
4370   //     } while (cnt)
4371   //  }
4372 
4373   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
4374 
4375   srli(len, len, LogBytesPerWord);
4376   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
4377   sub(len, len, t0);          // cnt -= unroll
4378   // tmp always points to the end of the region we're about to zero
4379   shadd(tmp, t0, addr, t1, LogBytesPerWord);
4380   la(t1, entry);
4381   slli(t0, t0, 2);
4382   sub(t1, t1, t0);
4383   jr(t1);
4384   bind(loop);
4385   sub(len, len, unroll);
4386   for (int i = -unroll; i < 0; i++) {
4387     sd(zr, Address(tmp, i * wordSize));
4388   }
4389   bind(entry);
4390   add(tmp, tmp, unroll * wordSize);
4391   bnez(len, loop);
4392 }
4393 
4394 // shift left by shamt and add
4395 // Rd = (Rs1 << shamt) + Rs2
4396 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
4397   if (UseZba) {
4398     if (shamt == 1) {
4399       sh1add(Rd, Rs1, Rs2);
4400       return;
4401     } else if (shamt == 2) {
4402       sh2add(Rd, Rs1, Rs2);
4403       return;
4404     } else if (shamt == 3) {
4405       sh3add(Rd, Rs1, Rs2);
4406       return;
4407     }
4408   }
4409 
4410   if (shamt != 0) {
4411     slli(tmp, Rs1, shamt);
4412     add(Rd, Rs2, tmp);
4413   } else {
4414     add(Rd, Rs1, Rs2);
4415   }
4416 }
4417 
4418 void MacroAssembler::zero_extend(Register dst, Register src, int bits) {
4419   if (UseZba && bits == 32) {
4420     zext_w(dst, src);
4421     return;
4422   }
4423 
4424   if (UseZbb && bits == 16) {
4425     zext_h(dst, src);
4426     return;
4427   }
4428 
4429   if (bits == 8) {
4430     zext_b(dst, src);
4431   } else {
4432     slli(dst, src, XLEN - bits);
4433     srli(dst, dst, XLEN - bits);
4434   }
4435 }
4436 
4437 void MacroAssembler::sign_extend(Register dst, Register src, int bits) {
4438   if (UseZbb) {
4439     if (bits == 8) {
4440       sext_b(dst, src);
4441       return;
4442     } else if (bits == 16) {
4443       sext_h(dst, src);
4444       return;
4445     }
4446   }
4447 
4448   if (bits == 32) {
4449     sext_w(dst, src);
4450   } else {
4451     slli(dst, src, XLEN - bits);
4452     srai(dst, dst, XLEN - bits);
4453   }
4454 }
4455 
4456 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
4457 {
4458   if (src1 == src2) {
4459     mv(dst, zr);
4460     return;
4461   }
4462   Label done;
4463   Register left = src1;
4464   Register right = src2;
4465   if (dst == src1) {
4466     assert_different_registers(dst, src2, tmp);
4467     mv(tmp, src1);
4468     left = tmp;
4469   } else if (dst == src2) {
4470     assert_different_registers(dst, src1, tmp);
4471     mv(tmp, src2);
4472     right = tmp;
4473   }
4474 
4475   // installs 1 if gt else 0
4476   slt(dst, right, left);
4477   bnez(dst, done);
4478   slt(dst, left, right);
4479   // dst = -1 if lt; else if eq , dst = 0
4480   neg(dst, dst);
4481   bind(done);
4482 }
4483 
4484 // The java_calling_convention describes stack locations as ideal slots on
4485 // a frame with no abi restrictions. Since we must observe abi restrictions
4486 // (like the placement of the register window) the slots must be biased by
4487 // the following value.
4488 static int reg2offset_in(VMReg r) {
4489   // Account for saved fp and ra
4490   // This should really be in_preserve_stack_slots
4491   return r->reg2stack() * VMRegImpl::stack_slot_size;
4492 }
4493 
4494 static int reg2offset_out(VMReg r) {
4495   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
4496 }
4497 
4498 // On 64 bit we will store integer like items to the stack as
4499 // 64 bits items (riscv64 abi) even though java would only store
4500 // 32bits for a parameter. On 32bit it will simply be 32 bits
4501 // So this routine will do 32->32 on 32bit and 32->64 on 64bit
4502 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
4503   if (src.first()->is_stack()) {
4504     if (dst.first()->is_stack()) {
4505       // stack to stack
4506       ld(tmp, Address(fp, reg2offset_in(src.first())));
4507       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4508     } else {
4509       // stack to reg
4510       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4511     }
4512   } else if (dst.first()->is_stack()) {
4513     // reg to stack
4514     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
4515   } else {
4516     if (dst.first() != src.first()) {
4517       sign_extend(dst.first()->as_Register(), src.first()->as_Register(), 32);
4518     }
4519   }
4520 }
4521 
4522 // An oop arg. Must pass a handle not the oop itself
4523 void MacroAssembler::object_move(OopMap* map,
4524                                  int oop_handle_offset,
4525                                  int framesize_in_slots,
4526                                  VMRegPair src,
4527                                  VMRegPair dst,
4528                                  bool is_receiver,
4529                                  int* receiver_offset) {
4530   assert_cond(map != nullptr && receiver_offset != nullptr);
4531 
4532   // must pass a handle. First figure out the location we use as a handle
4533   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
4534 
4535   // See if oop is null if it is we need no handle
4536 
4537   if (src.first()->is_stack()) {
4538     // Oop is already on the stack as an argument
4539     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
4540     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
4541     if (is_receiver) {
4542       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
4543     }
4544 
4545     ld(t0, Address(fp, reg2offset_in(src.first())));
4546     la(rHandle, Address(fp, reg2offset_in(src.first())));
4547     // conditionally move a null
4548     Label notZero1;
4549     bnez(t0, notZero1);
4550     mv(rHandle, zr);
4551     bind(notZero1);
4552   } else {
4553 
4554     // Oop is in a register we must store it to the space we reserve
4555     // on the stack for oop_handles and pass a handle if oop is non-null
4556 
4557     const Register rOop = src.first()->as_Register();
4558     int oop_slot = -1;
4559     if (rOop == j_rarg0) {
4560       oop_slot = 0;
4561     } else if (rOop == j_rarg1) {
4562       oop_slot = 1;
4563     } else if (rOop == j_rarg2) {
4564       oop_slot = 2;
4565     } else if (rOop == j_rarg3) {
4566       oop_slot = 3;
4567     } else if (rOop == j_rarg4) {
4568       oop_slot = 4;
4569     } else if (rOop == j_rarg5) {
4570       oop_slot = 5;
4571     } else if (rOop == j_rarg6) {
4572       oop_slot = 6;
4573     } else {
4574       assert(rOop == j_rarg7, "wrong register");
4575       oop_slot = 7;
4576     }
4577 
4578     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
4579     int offset = oop_slot * VMRegImpl::stack_slot_size;
4580 
4581     map->set_oop(VMRegImpl::stack2reg(oop_slot));
4582     // Store oop in handle area, may be null
4583     sd(rOop, Address(sp, offset));
4584     if (is_receiver) {
4585       *receiver_offset = offset;
4586     }
4587 
4588     //rOop maybe the same as rHandle
4589     if (rOop == rHandle) {
4590       Label isZero;
4591       beqz(rOop, isZero);
4592       la(rHandle, Address(sp, offset));
4593       bind(isZero);
4594     } else {
4595       Label notZero2;
4596       la(rHandle, Address(sp, offset));
4597       bnez(rOop, notZero2);
4598       mv(rHandle, zr);
4599       bind(notZero2);
4600     }
4601   }
4602 
4603   // If arg is on the stack then place it otherwise it is already in correct reg.
4604   if (dst.first()->is_stack()) {
4605     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
4606   }
4607 }
4608 
4609 // A float arg may have to do float reg int reg conversion
4610 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
4611   assert(src.first()->is_stack() && dst.first()->is_stack() ||
4612          src.first()->is_reg() && dst.first()->is_reg() ||
4613          src.first()->is_stack() && dst.first()->is_reg(), "Unexpected error");
4614   if (src.first()->is_stack()) {
4615     if (dst.first()->is_stack()) {
4616       lwu(tmp, Address(fp, reg2offset_in(src.first())));
4617       sw(tmp, Address(sp, reg2offset_out(dst.first())));
4618     } else if (dst.first()->is_Register()) {
4619       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4620     } else {
4621       ShouldNotReachHere();
4622     }
4623   } else if (src.first() != dst.first()) {
4624     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
4625       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
4626     } else {
4627       ShouldNotReachHere();
4628     }
4629   }
4630 }
4631 
4632 // A long move
4633 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
4634   if (src.first()->is_stack()) {
4635     if (dst.first()->is_stack()) {
4636       // stack to stack
4637       ld(tmp, Address(fp, reg2offset_in(src.first())));
4638       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4639     } else {
4640       // stack to reg
4641       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4642     }
4643   } else if (dst.first()->is_stack()) {
4644     // reg to stack
4645     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
4646   } else {
4647     if (dst.first() != src.first()) {
4648       mv(dst.first()->as_Register(), src.first()->as_Register());
4649     }
4650   }
4651 }
4652 
4653 // A double move
4654 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
4655   assert(src.first()->is_stack() && dst.first()->is_stack() ||
4656          src.first()->is_reg() && dst.first()->is_reg() ||
4657          src.first()->is_stack() && dst.first()->is_reg(), "Unexpected error");
4658   if (src.first()->is_stack()) {
4659     if (dst.first()->is_stack()) {
4660       ld(tmp, Address(fp, reg2offset_in(src.first())));
4661       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4662     } else if (dst.first()-> is_Register()) {
4663       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4664     } else {
4665       ShouldNotReachHere();
4666     }
4667   } else if (src.first() != dst.first()) {
4668     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
4669       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
4670     } else {
4671       ShouldNotReachHere();
4672     }
4673   }
4674 }
4675 
4676 void MacroAssembler::rt_call(address dest, Register tmp) {
4677   CodeBlob *cb = CodeCache::find_blob(dest);
4678   RuntimeAddress target(dest);
4679   if (cb) {
4680     far_call(target);
4681   } else {
4682     relocate(target.rspec(), [&] {
4683       int32_t offset;
4684       la_patchable(tmp, target, offset);
4685       jalr(x1, tmp, offset);
4686     });
4687   }
4688 }
4689 
4690 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
4691   assert(bit_pos < 64, "invalid bit range");
4692   if (UseZbs) {
4693     bexti(Rd, Rs, bit_pos);
4694     return;
4695   }
4696   int64_t imm = (int64_t)(1UL << bit_pos);
4697   if (is_simm12(imm)) {
4698     and_imm12(Rd, Rs, imm);
4699   } else {
4700     srli(Rd, Rs, bit_pos);
4701     and_imm12(Rd, Rd, 1);
4702   }
4703 }
4704 
4705 // Implements lightweight-locking.
4706 // Branches to slow upon failure to lock the object.
4707 // Falls through upon success.
4708 //
4709 //  - obj: the object to be locked
4710 //  - hdr: the header, already loaded from obj, will be destroyed
4711 //  - tmp1, tmp2: temporary registers, will be destroyed
4712 void MacroAssembler::lightweight_lock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow) {
4713   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4714   assert_different_registers(obj, hdr, tmp1, tmp2, t0);
4715 
4716   // Check if we would have space on lock-stack for the object.
4717   lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4718   mv(tmp2, (unsigned)LockStack::end_offset());
4719   bge(tmp1, tmp2, slow, /* is_far */ true);
4720 
4721   // Load (object->mark() | 1) into hdr
4722   ori(hdr, hdr, markWord::unlocked_value);
4723   // Clear lock-bits, into tmp2
4724   xori(tmp2, hdr, markWord::unlocked_value);
4725 
4726   // Try to swing header from unlocked to locked
4727   Label success;
4728   cmpxchgptr(hdr, tmp2, obj, tmp1, success, &slow);
4729   bind(success);
4730 
4731   // After successful lock, push object on lock-stack
4732   lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4733   add(tmp2, xthread, tmp1);
4734   sd(obj, Address(tmp2, 0));
4735   addw(tmp1, tmp1, oopSize);
4736   sw(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4737 }
4738 
4739 // Implements ligthweight-unlocking.
4740 // Branches to slow upon failure.
4741 // Falls through upon success.
4742 //
4743 // - obj: the object to be unlocked
4744 // - hdr: the (pre-loaded) header of the object
4745 // - tmp1, tmp2: temporary registers
4746 void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow) {
4747   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4748   assert_different_registers(obj, hdr, tmp1, tmp2, t0);
4749 
4750 #ifdef ASSERT
4751   {
4752     // The following checks rely on the fact that LockStack is only ever modified by
4753     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4754     // entries after inflation will happen delayed in that case.
4755 
4756     // Check for lock-stack underflow.
4757     Label stack_ok;
4758     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4759     mv(tmp2, (unsigned)LockStack::start_offset());
4760     bgt(tmp1, tmp2, stack_ok);
4761     STOP("Lock-stack underflow");
4762     bind(stack_ok);
4763   }
4764   {
4765     // Check if the top of the lock-stack matches the unlocked object.
4766     Label tos_ok;
4767     subw(tmp1, tmp1, oopSize);
4768     add(tmp1, xthread, tmp1);
4769     ld(tmp1, Address(tmp1, 0));
4770     beq(tmp1, obj, tos_ok);
4771     STOP("Top of lock-stack does not match the unlocked object");
4772     bind(tos_ok);
4773   }
4774   {
4775     // Check that hdr is fast-locked.
4776    Label hdr_ok;
4777     andi(tmp1, hdr, markWord::lock_mask_in_place);
4778     beqz(tmp1, hdr_ok);
4779     STOP("Header is not fast-locked");
4780     bind(hdr_ok);
4781   }
4782 #endif
4783 
4784   // Load the new header (unlocked) into tmp1
4785   ori(tmp1, hdr, markWord::unlocked_value);
4786 
4787   // Try to swing header from locked to unlocked
4788   Label success;
4789   cmpxchgptr(hdr, tmp1, obj, tmp2, success, &slow);
4790   bind(success);
4791 
4792   // After successful unlock, pop object from lock-stack
4793   lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4794   subw(tmp1, tmp1, oopSize);
4795 #ifdef ASSERT
4796   add(tmp2, xthread, tmp1);
4797   sd(zr, Address(tmp2, 0));
4798 #endif
4799   sw(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4800 }