1 /*
   2  * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/assembler.hpp"
  29 #include "asm/assembler.inline.hpp"
  30 #include "compiler/disassembler.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/cardTableBarrierSet.hpp"
  35 #include "gc/shared/collectedHeap.hpp"
  36 #include "interpreter/bytecodeHistogram.hpp"
  37 #include "interpreter/interpreter.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "nativeInst_riscv.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedKlass.inline.hpp"
  43 #include "oops/compressedOops.inline.hpp"
  44 #include "oops/klass.inline.hpp"
  45 #include "oops/oop.hpp"
  46 #include "runtime/interfaceSupport.inline.hpp"
  47 #include "runtime/javaThread.hpp"
  48 #include "runtime/jniHandles.inline.hpp"
  49 #include "runtime/sharedRuntime.hpp"
  50 #include "runtime/stubRoutines.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/compile.hpp"
  54 #include "opto/node.hpp"
  55 #include "opto/output.hpp"
  56 #endif
  57 
  58 #ifdef PRODUCT
  59 #define BLOCK_COMMENT(str) /* nothing */
  60 #else
  61 #define BLOCK_COMMENT(str) block_comment(str)
  62 #endif
  63 #define STOP(str) stop(str);
  64 #define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
  65 
  66 static void pass_arg0(MacroAssembler* masm, Register arg) {
  67   if (c_rarg0 != arg) {
  68     masm->mv(c_rarg0, arg);
  69   }
  70 }
  71 
  72 static void pass_arg1(MacroAssembler* masm, Register arg) {
  73   if (c_rarg1 != arg) {
  74     masm->mv(c_rarg1, arg);
  75   }
  76 }
  77 
  78 static void pass_arg2(MacroAssembler* masm, Register arg) {
  79   if (c_rarg2 != arg) {
  80     masm->mv(c_rarg2, arg);
  81   }
  82 }
  83 
  84 static void pass_arg3(MacroAssembler* masm, Register arg) {
  85   if (c_rarg3 != arg) {
  86     masm->mv(c_rarg3, arg);
  87   }
  88 }
  89 
  90 void MacroAssembler::push_cont_fastpath(Register java_thread) {
  91   if (!Continuations::enabled()) return;
  92   Label done;
  93   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
  94   bleu(sp, t0, done);
  95   sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
  96   bind(done);
  97 }
  98 
  99 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
 100   if (!Continuations::enabled()) return;
 101   Label done;
 102   ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
 103   bltu(sp, t0, done);
 104   sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
 105   bind(done);
 106 }
 107 
 108 int MacroAssembler::align(int modulus, int extra_offset) {
 109   CompressibleRegion cr(this);
 110   intptr_t before = offset();
 111   while ((offset() + extra_offset) % modulus != 0) { nop(); }
 112   return (int)(offset() - before);
 113 }
 114 
 115 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 116   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 117 }
 118 
 119 // Implementation of call_VM versions
 120 
 121 void MacroAssembler::call_VM(Register oop_result,
 122                              address entry_point,
 123                              bool check_exceptions) {
 124   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 125 }
 126 
 127 void MacroAssembler::call_VM(Register oop_result,
 128                              address entry_point,
 129                              Register arg_1,
 130                              bool check_exceptions) {
 131   pass_arg1(this, arg_1);
 132   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 133 }
 134 
 135 void MacroAssembler::call_VM(Register oop_result,
 136                              address entry_point,
 137                              Register arg_1,
 138                              Register arg_2,
 139                              bool check_exceptions) {
 140   assert_different_registers(arg_1, c_rarg2);
 141   pass_arg2(this, arg_2);
 142   pass_arg1(this, arg_1);
 143   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 144 }
 145 
 146 void MacroAssembler::call_VM(Register oop_result,
 147                              address entry_point,
 148                              Register arg_1,
 149                              Register arg_2,
 150                              Register arg_3,
 151                              bool check_exceptions) {
 152   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 153   assert_different_registers(arg_2, c_rarg3);
 154   pass_arg3(this, arg_3);
 155 
 156   pass_arg2(this, arg_2);
 157 
 158   pass_arg1(this, arg_1);
 159   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 160 }
 161 
 162 void MacroAssembler::call_VM(Register oop_result,
 163                              Register last_java_sp,
 164                              address entry_point,
 165                              int number_of_arguments,
 166                              bool check_exceptions) {
 167   call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 168 }
 169 
 170 void MacroAssembler::call_VM(Register oop_result,
 171                              Register last_java_sp,
 172                              address entry_point,
 173                              Register arg_1,
 174                              bool check_exceptions) {
 175   pass_arg1(this, arg_1);
 176   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 177 }
 178 
 179 void MacroAssembler::call_VM(Register oop_result,
 180                              Register last_java_sp,
 181                              address entry_point,
 182                              Register arg_1,
 183                              Register arg_2,
 184                              bool check_exceptions) {
 185 
 186   assert_different_registers(arg_1, c_rarg2);
 187   pass_arg2(this, arg_2);
 188   pass_arg1(this, arg_1);
 189   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 190 }
 191 
 192 void MacroAssembler::call_VM(Register oop_result,
 193                              Register last_java_sp,
 194                              address entry_point,
 195                              Register arg_1,
 196                              Register arg_2,
 197                              Register arg_3,
 198                              bool check_exceptions) {
 199   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 200   assert_different_registers(arg_2, c_rarg3);
 201   pass_arg3(this, arg_3);
 202   pass_arg2(this, arg_2);
 203   pass_arg1(this, arg_1);
 204   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 205 }
 206 
 207 void MacroAssembler::post_call_nop() {
 208   if (!Continuations::enabled()) {
 209     return;
 210   }
 211   relocate(post_call_nop_Relocation::spec(), [&] {
 212     InlineSkippedInstructionsCounter skipCounter(this);
 213     nop();
 214     li32(zr, 0);
 215   });
 216 }
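     // Note on the sequence above: writes to x0 are discarded, so the lui/addiw pair
     // emitted by li32(zr, 0) executes as a no-op whose 32-bit immediate field can
     // later be patched (the post_call_nop relocation records the site) without
     // changing the behavior of the code at this post-call site.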
 217 
 218 // these are no-ops overridden by InterpreterMacroAssembler
 219 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
 220 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
 221 
 222 // Calls to C land
 223 //
 224 // When entering C land, the fp and esp of the last Java frame have to be recorded
 225 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 226 // has to be reset to 0. This is required to allow proper stack traversal.
 227 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 228                                          Register last_java_fp,
 229                                          Register last_java_pc,
 230                                          Register tmp) {
 231 
 232   if (last_java_pc->is_valid()) {
 233       sd(last_java_pc, Address(xthread,
 234                                JavaThread::frame_anchor_offset() +
 235                                JavaFrameAnchor::last_Java_pc_offset()));
 236   }
 237 
 238   // determine last_java_sp register
 239   if (last_java_sp == sp) {
 240     mv(tmp, sp);
 241     last_java_sp = tmp;
 242   } else if (!last_java_sp->is_valid()) {
 243     last_java_sp = esp;
 244   }
 245 
 246   sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
 247 
 248   // last_java_fp is optional
 249   if (last_java_fp->is_valid()) {
 250     sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
 251   }
 252 }
 253 
 254 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 255                                          Register last_java_fp,
 256                                          address  last_java_pc,
 257                                          Register tmp) {
 258   assert(last_java_pc != nullptr, "must provide a valid PC");
 259 
 260   la(tmp, last_java_pc);
 261   sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
 262 
 263   set_last_Java_frame(last_java_sp, last_java_fp, noreg, tmp);
 264 }
 265 
 266 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 267                                          Register last_java_fp,
 268                                          Label &L,
 269                                          Register tmp) {
 270   if (L.is_bound()) {
 271     set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
 272   } else {
 273     L.add_patch_at(code(), locator());
 274     IncompressibleRegion ir(this);  // the label address will be patched back.
 275     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
 276   }
 277 }
 278 
 279 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 280   // we must set sp to zero to clear frame
 281   sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
 282 
 283   // must clear fp, so that compiled frames are not confused; it is
 284   // possible that we need it only for debugging
 285   if (clear_fp) {
 286     sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
 287   }
 288 
 289   // Always clear the pc because it could have been set by make_walkable()
 290   sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
 291 }
 292 
 293 void MacroAssembler::call_VM_base(Register oop_result,
 294                                   Register java_thread,
 295                                   Register last_java_sp,
 296                                   address  entry_point,
 297                                   int      number_of_arguments,
 298                                   bool     check_exceptions) {
 299    // determine java_thread register
 300   if (!java_thread->is_valid()) {
 301     java_thread = xthread;
 302   }
 303   // determine last_java_sp register
 304   if (!last_java_sp->is_valid()) {
 305     last_java_sp = esp;
 306   }
 307 
 308   // debugging support
 309   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 310   assert(java_thread == xthread, "unexpected register");
 311 
 312   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 313   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 314 
 315   // push java thread (becomes first argument of C function)
 316   mv(c_rarg0, java_thread);
 317 
 318   // set last Java frame before call
 319   assert(last_java_sp != fp, "can't use fp");
 320 
 321   Label l;
 322   set_last_Java_frame(last_java_sp, fp, l, t0);
 323 
 324   // do the call, remove parameters
 325   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 326 
 327   // reset last Java frame
 328   // Only interpreter should have to clear fp
 329   reset_last_Java_frame(true);
 330 
 331    // C++ interp handles this in the interpreter
 332   check_and_handle_popframe(java_thread);
 333   check_and_handle_earlyret(java_thread);
 334 
 335   if (check_exceptions) {
 336     // check for pending exceptions (java_thread is set upon return)
 337     ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 338     Label ok;
 339     beqz(t0, ok);
 340     RuntimeAddress target(StubRoutines::forward_exception_entry());
 341     relocate(target.rspec(), [&] {
 342       int32_t offset;
 343       la_patchable(t0, target, offset);
 344       jalr(x0, t0, offset);
 345     });
 346     bind(ok);
 347   }
 348 
 349   // get oop result if there is one and reset the value in the thread
 350   if (oop_result->is_valid()) {
 351     get_vm_result(oop_result, java_thread);
 352   }
 353 }
 354 
 355 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 356   ld(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 357   sd(zr, Address(java_thread, JavaThread::vm_result_offset()));
 358   verify_oop_msg(oop_result, "broken oop in call_VM_base");
 359 }
 360 
 361 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 362   ld(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 363   sd(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 364 }
 365 
 366 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
 367   assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
 368   assert_different_registers(klass, xthread, tmp);
 369 
 370   Label L_fallthrough, L_tmp;
 371   if (L_fast_path == nullptr) {
 372     L_fast_path = &L_fallthrough;
 373   } else if (L_slow_path == nullptr) {
 374     L_slow_path = &L_fallthrough;
 375   }
 376 
 377   // Fast path check: class is fully initialized
 378   lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
 379   sub(tmp, tmp, InstanceKlass::fully_initialized);
 380   beqz(tmp, *L_fast_path);
 381 
 382   // Fast path check: current thread is initializer thread
 383   ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
 384 
 385   if (L_slow_path == &L_fallthrough) {
 386     beq(xthread, tmp, *L_fast_path);
 387     bind(*L_slow_path);
 388   } else if (L_fast_path == &L_fallthrough) {
 389     bne(xthread, tmp, *L_slow_path);
 390     bind(*L_fast_path);
 391   } else {
 392     Unimplemented();
 393   }
 394 }
 395 
 396 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
 397   if (!VerifyOops) { return; }
 398 
 399   // Pass register number to verify_oop_subroutine
 400   const char* b = nullptr;
 401   {
 402     ResourceMark rm;
 403     stringStream ss;
 404     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
 405     b = code_string(ss.as_string());
 406   }
 407   BLOCK_COMMENT("verify_oop {");
 408 
 409   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 410 
 411   mv(c_rarg0, reg); // c_rarg0 : x10
 412   {
 413     // The length of the instruction sequence emitted should not depend
 414     // on the address of the char buffer so that the size of mach nodes for
 415     // scratch emit and normal emit matches.
 416     IncompressibleRegion ir(this);  // Fixed length
 417     movptr(t0, (address) b);
 418   }
 419 
 420   // call indirectly to solve generation ordering problem
 421   ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 422   relocate(target.rspec(), [&] {
 423     int32_t offset;
 424     la_patchable(t1, target, offset);
 425     ld(t1, Address(t1, offset));
 426   });
 427   jalr(t1);
 428 
 429   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 430 
 431   BLOCK_COMMENT("} verify_oop");
 432 }
 433 
 434 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
 435   if (!VerifyOops) {
 436     return;
 437   }
 438 
 439   const char* b = nullptr;
 440   {
 441     ResourceMark rm;
 442     stringStream ss;
 443     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
 444     b = code_string(ss.as_string());
 445   }
 446   BLOCK_COMMENT("verify_oop_addr {");
 447 
 448   push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 449 
 450   if (addr.uses(sp)) {
 451     la(x10, addr);
 452     ld(x10, Address(x10, 4 * wordSize));
 453   } else {
 454     ld(x10, addr);
 455   }
 456 
 457   {
 458     // The length of the instruction sequence emitted should not depend
 459     // on the address of the char buffer so that the size of mach nodes for
 460     // scratch emit and normal emit matches.
 461     IncompressibleRegion ir(this);  // Fixed length
 462     movptr(t0, (address) b);
 463   }
 464 
 465   // call indirectly to solve generation ordering problem
 466   ExternalAddress target(StubRoutines::verify_oop_subroutine_entry_address());
 467   relocate(target.rspec(), [&] {
 468     int32_t offset;
 469     la_patchable(t1, target, offset);
 470     ld(t1, Address(t1, offset));
 471   });
 472   jalr(t1);
 473 
 474   pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
 475 
 476   BLOCK_COMMENT("} verify_oop_addr");
 477 }
 478 
 479 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 480                                          int extra_slot_offset) {
 481   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 482   int stackElementSize = Interpreter::stackElementSize;
 483   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 484 #ifdef ASSERT
 485   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 486   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 487 #endif
 488   if (arg_slot.is_constant()) {
 489     return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
 490   } else {
 491     assert_different_registers(t0, arg_slot.as_register());
 492     shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
 493     return Address(t0, offset);
 494   }
 495 }
 496 
 497 #ifndef PRODUCT
 498 extern "C" void findpc(intptr_t x);
 499 #endif
 500 
 501 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
 502 {
 503   // In order to get locks to work, we need to fake an in_VM state
 504   if (ShowMessageBoxOnError) {
 505     JavaThread* thread = JavaThread::current();
 506     JavaThreadState saved_state = thread->thread_state();
 507     thread->set_thread_state(_thread_in_vm);
 508 #ifndef PRODUCT
 509     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 510       ttyLocker ttyl;
 511       BytecodeCounter::print();
 512     }
 513 #endif
 514     if (os::message_box(msg, "Execution stopped, print registers?")) {
 515       ttyLocker ttyl;
 516       tty->print_cr(" pc = 0x%016lx", pc);
 517 #ifndef PRODUCT
 518       tty->cr();
 519       findpc(pc);
 520       tty->cr();
 521 #endif
 522       tty->print_cr(" x0 = 0x%016lx", regs[0]);
 523       tty->print_cr(" x1 = 0x%016lx", regs[1]);
 524       tty->print_cr(" x2 = 0x%016lx", regs[2]);
 525       tty->print_cr(" x3 = 0x%016lx", regs[3]);
 526       tty->print_cr(" x4 = 0x%016lx", regs[4]);
 527       tty->print_cr(" x5 = 0x%016lx", regs[5]);
 528       tty->print_cr(" x6 = 0x%016lx", regs[6]);
 529       tty->print_cr(" x7 = 0x%016lx", regs[7]);
 530       tty->print_cr(" x8 = 0x%016lx", regs[8]);
 531       tty->print_cr(" x9 = 0x%016lx", regs[9]);
 532       tty->print_cr("x10 = 0x%016lx", regs[10]);
 533       tty->print_cr("x11 = 0x%016lx", regs[11]);
 534       tty->print_cr("x12 = 0x%016lx", regs[12]);
 535       tty->print_cr("x13 = 0x%016lx", regs[13]);
 536       tty->print_cr("x14 = 0x%016lx", regs[14]);
 537       tty->print_cr("x15 = 0x%016lx", regs[15]);
 538       tty->print_cr("x16 = 0x%016lx", regs[16]);
 539       tty->print_cr("x17 = 0x%016lx", regs[17]);
 540       tty->print_cr("x18 = 0x%016lx", regs[18]);
 541       tty->print_cr("x19 = 0x%016lx", regs[19]);
 542       tty->print_cr("x20 = 0x%016lx", regs[20]);
 543       tty->print_cr("x21 = 0x%016lx", regs[21]);
 544       tty->print_cr("x22 = 0x%016lx", regs[22]);
 545       tty->print_cr("x23 = 0x%016lx", regs[23]);
 546       tty->print_cr("x24 = 0x%016lx", regs[24]);
 547       tty->print_cr("x25 = 0x%016lx", regs[25]);
 548       tty->print_cr("x26 = 0x%016lx", regs[26]);
 549       tty->print_cr("x27 = 0x%016lx", regs[27]);
 550       tty->print_cr("x28 = 0x%016lx", regs[28]);
           tty->print_cr("x29 = 0x%016lx", regs[29]);
 551       tty->print_cr("x30 = 0x%016lx", regs[30]);
 552       tty->print_cr("x31 = 0x%016lx", regs[31]);
 553       BREAKPOINT;
 554     }
 555   }
 556   fatal("DEBUG MESSAGE: %s", msg);
 557 }
 558 
 559 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
 560   assert_different_registers(value, tmp1, tmp2);
 561   Label done, tagged, weak_tagged;
 562 
 563   beqz(value, done);           // Use null as-is.
 564   // Test for tag.
 565   andi(tmp1, value, JNIHandles::tag_mask);
 566   bnez(tmp1, tagged);
 567 
 568   // Resolve local handle
 569   access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
 570   verify_oop(value);
 571   j(done);
 572 
 573   bind(tagged);
 574   // Test for jweak tag.
 575   STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
 576   test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
 577   bnez(tmp1, weak_tagged);
 578 
 579   // Resolve global handle
 580   access_load_at(T_OBJECT, IN_NATIVE, value,
 581                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 582   verify_oop(value);
 583   j(done);
 584 
 585   bind(weak_tagged);
 586   // Resolve jweak.
 587   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
 588                  Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
 589   verify_oop(value);
 590 
 591   bind(done);
 592 }
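     // Tag scheme assumed by the code above (see the STATIC_ASSERTs here and in
     // resolve_global_jobject below): JNI handles keep their type in the low bits of
     // the pointer, with weak global handles tagged 0b1, global handles tagged 0b10,
     // and local handles left untagged. Subtracting the tag, as in
     // Address(value, -JNIHandles::TypeTag::global), addresses the handle's oop slot.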
 593 
 594 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
 595   assert_different_registers(value, tmp1, tmp2);
 596   Label done;
 597 
 598   beqz(value, done);           // Use null as-is.
 599 
 600 #ifdef ASSERT
 601   {
 602     STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
 603     Label valid_global_tag;
 604     test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
 605     bnez(tmp1, valid_global_tag);
 606     stop("non global jobject using resolve_global_jobject");
 607     bind(valid_global_tag);
 608   }
 609 #endif
 610 
 611   // Resolve global handle
 612   access_load_at(T_OBJECT, IN_NATIVE, value,
 613                  Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
 614   verify_oop(value);
 615 
 616   bind(done);
 617 }
 618 
 619 void MacroAssembler::stop(const char* msg) {
 620   BLOCK_COMMENT(msg);
 621   illegal_instruction(Assembler::csr::time);
 622   emit_int64((uintptr_t)msg);
 623 }
 624 
 625 void MacroAssembler::unimplemented(const char* what) {
 626   const char* buf = nullptr;
 627   {
 628     ResourceMark rm;
 629     stringStream ss;
 630     ss.print("unimplemented: %s", what);
 631     buf = code_string(ss.as_string());
 632   }
 633   stop(buf);
 634 }
 635 
 636 void MacroAssembler::emit_static_call_stub() {
 637   IncompressibleRegion ir(this);  // Fixed length: see CompiledStaticCall::to_interp_stub_size().
 638   // CompiledDirectStaticCall::set_to_interpreted knows the
 639   // exact layout of this stub.
 640 
 641   mov_metadata(xmethod, (Metadata*)nullptr);
 642 
 643   // Jump to the entry point of the c2i stub.
 644   int32_t offset = 0;
 645   movptr(t0, 0, offset);
 646   jalr(x0, t0, offset);
 647 }
 648 
 649 void MacroAssembler::call_VM_leaf_base(address entry_point,
 650                                        int number_of_arguments,
 651                                        Label *retaddr) {
 652   push_reg(RegSet::of(t0, xmethod), sp);   // push << t0 & xmethod >> to sp
 653   call(entry_point);
 654   if (retaddr != nullptr) {
 655     bind(*retaddr);
 656   }
 657   pop_reg(RegSet::of(t0, xmethod), sp);   // pop << t0 & xmethod >> from sp
 658 }
 659 
 660 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
 661   call_VM_leaf_base(entry_point, number_of_arguments);
 662 }
 663 
 664 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
 665   pass_arg0(this, arg_0);
 666   call_VM_leaf_base(entry_point, 1);
 667 }
 668 
 669 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 670   assert_different_registers(arg_1, c_rarg0);
 671   pass_arg0(this, arg_0);
 672   pass_arg1(this, arg_1);
 673   call_VM_leaf_base(entry_point, 2);
 674 }
 675 
 676 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
 677                                   Register arg_1, Register arg_2) {
 678   assert_different_registers(arg_1, c_rarg0);
 679   assert_different_registers(arg_2, c_rarg0, c_rarg1);
 680   pass_arg0(this, arg_0);
 681   pass_arg1(this, arg_1);
 682   pass_arg2(this, arg_2);
 683   call_VM_leaf_base(entry_point, 3);
 684 }
 685 
 686 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
 687   pass_arg0(this, arg_0);
 688   MacroAssembler::call_VM_leaf_base(entry_point, 1);
 689 }
 690 
 691 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 692 
 693   assert_different_registers(arg_0, c_rarg1);
 694   pass_arg1(this, arg_1);
 695   pass_arg0(this, arg_0);
 696   MacroAssembler::call_VM_leaf_base(entry_point, 2);
 697 }
 698 
 699 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 700   assert_different_registers(arg_0, c_rarg1, c_rarg2);
 701   assert_different_registers(arg_1, c_rarg2);
 702   pass_arg2(this, arg_2);
 703   pass_arg1(this, arg_1);
 704   pass_arg0(this, arg_0);
 705   MacroAssembler::call_VM_leaf_base(entry_point, 3);
 706 }
 707 
 708 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
 709   assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
 710   assert_different_registers(arg_1, c_rarg2, c_rarg3);
 711   assert_different_registers(arg_2, c_rarg3);
 712 
 713   pass_arg3(this, arg_3);
 714   pass_arg2(this, arg_2);
 715   pass_arg1(this, arg_1);
 716   pass_arg0(this, arg_0);
 717   MacroAssembler::call_VM_leaf_base(entry_point, 4);
 718 }
 719 
 720 void MacroAssembler::la(Register Rd, const address dest) {
 721   int64_t offset = dest - pc();
 722   if (is_simm32(offset)) {
 723     auipc(Rd, (int32_t)offset + 0x800);  // 0x800 compensates for the sign extension of offset[11:0] by the following addi
 724     addi(Rd, Rd, ((int64_t)offset << 52) >> 52);
 725   } else {
 726     movptr(Rd, dest);
 727   }
 728 }
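     // A worked example of the auipc/addi pair above, with an illustrative offset:
     // for offset = 0x1234fff the low 12 bits sign-extend to -1, so the emitted code is
     //   auipc Rd, 0x1235      # Rd = pc + 0x1235000 (upper 20 bits of offset + 0x800)
     //   addi  Rd, Rd, -1      # Rd = pc + 0x1234fff
     // The +0x800 pre-compensates for that sign extension whenever offset[11] is set.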
 729 
 730 void MacroAssembler::la(Register Rd, const Address &adr) {
 731   switch (adr.getMode()) {
 732     case Address::literal: {
 733       relocInfo::relocType rtype = adr.rspec().reloc()->type();
 734       if (rtype == relocInfo::none) {
 735         mv(Rd, (intptr_t)(adr.target()));
 736       } else {
 737         relocate(adr.rspec(), [&] {
 738           movptr(Rd, adr.target());
 739         });
 740       }
 741       break;
 742     }
 743     case Address::base_plus_offset: {
 744       Address new_adr = legitimize_address(Rd, adr);
 745       if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
 746         addi(Rd, new_adr.base(), new_adr.offset());
 747       }
 748       break;
 749     }
 750     default:
 751       ShouldNotReachHere();
 752   }
 753 }
 754 
 755 void MacroAssembler::la(Register Rd, Label &label) {
 756   IncompressibleRegion ir(this);   // the label address may be patched back.
 757   wrap_label(Rd, label, &MacroAssembler::la);
 758 }
 759 
 760 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
 761   lui(Rd, (uint32_t)imm << 12);
 762   srli(Rd, Rd, 12);
 763 }
 764 
 765 void MacroAssembler::li32(Register Rd, int32_t imm) {
 766   // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
 767   int64_t upper = imm, lower = imm;
 768   lower = (imm << 20) >> 20;
 769   upper -= lower;
 770   upper = (int32_t)upper;
 771   // lui Rd, imm[31:12] + imm[11]
 772   lui(Rd, upper);
 773   // use addiw to distinguish li32 from li64
 774   addiw(Rd, Rd, lower);
 775 }
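     // A worked example with an illustrative value: li32(Rd, 0x12345fff) computes
     // lower = sign-extend(0xfff) = -1 and upper = 0x12346000, emitting
     //   lui   Rd, 0x12346     # Rd = 0x12346000
     //   addiw Rd, Rd, -1      # Rd = 0x0000000012345fff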
 776 
 777 void MacroAssembler::li64(Register Rd, int64_t imm) {
 778   // Load upper 32 bits. upper = imm[63:32], but if imm[31] == 1 or
 779   // (imm[31:20] == 0x7ff && imm[19] == 1), upper = imm[63:32] + 1.
 780   int64_t lower = imm & 0xffffffff;
 781   lower -= ((lower << 44) >> 44);
 782   int64_t tmp_imm = ((uint64_t)(imm & 0xffffffff00000000)) + (uint64_t)lower;
 783   int32_t upper = (tmp_imm - (int32_t)lower) >> 32;
 784 
 785   // Load upper 32 bits
 786   int64_t up = upper, lo = upper;
 787   lo = (lo << 52) >> 52;
 788   up -= lo;
 789   up = (int32_t)up;
 790   lui(Rd, up);
 791   addi(Rd, Rd, lo);
 792 
 793   // Load the remaining 32 bits.
 794   slli(Rd, Rd, 12);
 795   addi(Rd, Rd, (int32_t)lower >> 20);
 796   slli(Rd, Rd, 12);
 797   lower = ((int32_t)imm << 12) >> 20;
 798   addi(Rd, Rd, lower);
 799   slli(Rd, Rd, 8);
 800   lower = imm & 0xff;
 801   addi(Rd, Rd, lower);
 802 }
 803 
 804 void MacroAssembler::li(Register Rd, int64_t imm) {
 805   // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
 806   // li -> c.li
 807   if (do_compress() && (is_simm6(imm) && Rd != x0)) {
 808     c_li(Rd, imm);
 809     return;
 810   }
 811 
 812   int shift = 12;
 813   int64_t upper = imm, lower = imm;
 814   // Split imm to a lower 12-bit sign-extended part and the remainder,
 815   // because addi will sign-extend the lower imm.
 816   lower = ((int32_t)imm << 20) >> 20;
 817   upper -= lower;
 818 
 819   // Test whether imm is a 32-bit integer.
 820   if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
 821         (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
 822     while (((upper >> shift) & 1) == 0) { shift++; }
 823     upper >>= shift;
 824     li(Rd, upper);
 825     slli(Rd, Rd, shift);
 826     if (lower != 0) {
 827       addi(Rd, Rd, lower);
 828     }
 829   } else {
 830     // 32-bit integer
 831     Register hi_Rd = zr;
 832     if (upper != 0) {
 833       lui(Rd, (int32_t)upper);
 834       hi_Rd = Rd;
 835     }
 836     if (lower != 0 || hi_Rd == zr) {
 837       addiw(Rd, hi_Rd, lower);
 838     }
 839   }
 840 }
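     // A worked example of the non-32-bit path above, with an illustrative value:
     // li(Rd, 0x4000000123) splits off lower = 0x123, leaving upper = 0x4000000000.
     // upper is shifted right until its low bit is set (shift = 38) and then rebuilt
     // through the recursive call:
     //   li   Rd, 1            # recursive call, emitted as c.li when compressible
     //   slli Rd, Rd, 38       # Rd = 0x4000000000
     //   addi Rd, Rd, 0x123    # Rd = 0x4000000123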
 841 
 842 #define INSN(NAME, REGISTER)                                       \
 843   void MacroAssembler::NAME(const address dest, Register temp) {   \
 844     assert_cond(dest != nullptr);                                  \
 845     int64_t distance = dest - pc();                                \
 846     if (is_simm21(distance) && ((distance % 2) == 0)) {            \
 847       Assembler::jal(REGISTER, distance);                          \
 848     } else {                                                       \
 849       assert(temp != noreg, "expecting a register");               \
 850       int32_t offset = 0;                                          \
 851       movptr(temp, dest, offset);                                  \
 852       Assembler::jalr(REGISTER, temp, offset);                     \
 853     }                                                              \
 854   }                                                                \
 855 
 856   INSN(j,   x0);
 857   INSN(jal, x1);
 858 
 859 #undef INSN
 860 
 861 #define INSN(NAME, REGISTER)                                       \
 862   void MacroAssembler::NAME(const Address &adr, Register temp) {   \
 863     switch (adr.getMode()) {                                       \
 864       case Address::literal: {                                     \
 865         relocate(adr.rspec(), [&] {                                \
 866           NAME(adr.target(), temp);                                \
 867         });                                                        \
 868         break;                                                     \
 869       }                                                            \
 870       case Address::base_plus_offset: {                            \
 871         int32_t offset = ((int32_t)adr.offset() << 20) >> 20;      \
 872         la(temp, Address(adr.base(), adr.offset() - offset));      \
 873         Assembler::jalr(REGISTER, temp, offset);                   \
 874         break;                                                     \
 875       }                                                            \
 876       default:                                                     \
 877         ShouldNotReachHere();                                      \
 878     }                                                              \
 879   }
 880 
 881   INSN(j,   x0);
 882   INSN(jal, x1);
 883 
 884 #undef INSN
 885 
 886 #define INSN(NAME)                                                                    \
 887   void MacroAssembler::NAME(Register Rd, const address dest, Register temp) {         \
 888     assert_cond(dest != nullptr);                                                     \
 889     int64_t distance = dest - pc();                                                   \
 890     if (is_simm21(distance) && ((distance % 2) == 0)) {                               \
 891       Assembler::NAME(Rd, distance);                                                  \
 892     } else {                                                                          \
 893       assert_different_registers(Rd, temp);                                           \
 894       int32_t offset = 0;                                                             \
 895       movptr(temp, dest, offset);                                                     \
 896       jalr(Rd, temp, offset);                                                         \
 897     }                                                                                 \
 898   }                                                                                   \
 899   void MacroAssembler::NAME(Register Rd, Label &L, Register temp) {                   \
 900     assert_different_registers(Rd, temp);                                             \
 901     wrap_label(Rd, L, temp, &MacroAssembler::NAME);                                   \
 902   }
 903 
 904   INSN(jal);
 905 
 906 #undef INSN
 907 
 908 #define INSN(NAME, REGISTER)                                       \
 909   void MacroAssembler::NAME(Label &l, Register temp) {             \
 910     jal(REGISTER, l, temp);                                        \
 911   }                                                                \
 912 
 913   INSN(j,   x0);
 914   INSN(jal, x1);
 915 
 916 #undef INSN
 917 
 918 void MacroAssembler::wrap_label(Register Rt, Label &L, Register tmp, load_insn_by_temp insn) {
 919   if (L.is_bound()) {
 920     (this->*insn)(Rt, target(L), tmp);
 921   } else {
 922     L.add_patch_at(code(), locator());
 923     (this->*insn)(Rt, pc(), tmp);
 924   }
 925 }
 926 
 927 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
 928   if (L.is_bound()) {
 929     (this->*insn)(Rt, target(L));
 930   } else {
 931     L.add_patch_at(code(), locator());
 932     (this->*insn)(Rt, pc());
 933   }
 934 }
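     // For an unbound label both wrap_label variants above emit the instruction
     // against the current pc() as a placeholder and record the position with
     // add_patch_at(); the real target is patched in once the label is bound.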
 935 
 936 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
 937                                 compare_and_branch_insn insn,
 938                                 compare_and_branch_label_insn neg_insn, bool is_far) {
 939   if (is_far) {
 940     Label done;
 941     (this->*neg_insn)(r1, r2, done, /* is_far */ false);
 942     j(L);
 943     bind(done);
 944   } else {
 945     if (L.is_bound()) {
 946       (this->*insn)(r1, r2, target(L));
 947     } else {
 948       L.add_patch_at(code(), locator());
 949       (this->*insn)(r1, r2, pc());
 950     }
 951   }
 952 }
 953 
 954 #define INSN(NAME, NEG_INSN)                                                              \
 955   void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) {          \
 956     wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far);    \
 957   }
 958 
 959   INSN(beq,  bne);
 960   INSN(bne,  beq);
 961   INSN(blt,  bge);
 962   INSN(bge,  blt);
 963   INSN(bltu, bgeu);
 964   INSN(bgeu, bltu);
 965 
 966 #undef INSN
 967 
 968 #define INSN(NAME)                                                                \
 969   void MacroAssembler::NAME##z(Register Rs, const address dest) {                 \
 970     NAME(Rs, zr, dest);                                                           \
 971   }                                                                               \
 972   void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) {              \
 973     NAME(Rs, zr, l, is_far);                                                      \
 974   }                                                                               \
 975 
 976   INSN(beq);
 977   INSN(bne);
 978   INSN(blt);
 979   INSN(ble);
 980   INSN(bge);
 981   INSN(bgt);
 982 
 983 #undef INSN
 984 
 985 #define INSN(NAME, NEG_INSN)                                                      \
 986   void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) {       \
 987     NEG_INSN(Rt, Rs, dest);                                                       \
 988   }                                                                               \
 989   void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) {    \
 990     NEG_INSN(Rt, Rs, l, is_far);                                                  \
 991   }
 992 
 993   INSN(bgt,  blt);
 994   INSN(ble,  bge);
 995   INSN(bgtu, bltu);
 996   INSN(bleu, bgeu);
 997 
 998 #undef INSN
 999 
1000 // Float compare branch instructions
1001 
1002 #define INSN(NAME, FLOATCMP, BRANCH)                                                                                    \
1003   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {   \
1004     FLOATCMP##_s(t0, Rs1, Rs2);                                                                                         \
1005     BRANCH(t0, l, is_far);                                                                                              \
1006   }                                                                                                                     \
1007   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) {  \
1008     FLOATCMP##_d(t0, Rs1, Rs2);                                                                                         \
1009     BRANCH(t0, l, is_far);                                                                                              \
1010   }
1011 
1012   INSN(beq, feq, bnez);
1013   INSN(bne, feq, beqz);
1014 
1015 #undef INSN
1016 
1017 
1018 #define INSN(NAME, FLOATCMP1, FLOATCMP2)                                              \
1019   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,   \
1020                                     bool is_far, bool is_unordered) {                 \
1021     if (is_unordered) {                                                               \
1022       /* jump if either source is NaN or condition is expected */                     \
1023       FLOATCMP2##_s(t0, Rs2, Rs1);                                                    \
1024       beqz(t0, l, is_far);                                                            \
1025     } else {                                                                          \
1026       /* jump if no NaN in source and condition is expected */                        \
1027       FLOATCMP1##_s(t0, Rs1, Rs2);                                                    \
1028       bnez(t0, l, is_far);                                                            \
1029     }                                                                                 \
1030   }                                                                                   \
1031   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1032                                      bool is_far, bool is_unordered) {                \
1033     if (is_unordered) {                                                               \
1034       /* jump if either source is NaN or condition is expected */                     \
1035       FLOATCMP2##_d(t0, Rs2, Rs1);                                                    \
1036       beqz(t0, l, is_far);                                                            \
1037     } else {                                                                          \
1038       /* jump if no NaN in source and condition is expected */                        \
1039       FLOATCMP1##_d(t0, Rs1, Rs2);                                                    \
1040       bnez(t0, l, is_far);                                                            \
1041     }                                                                                 \
1042   }
1043 
1044   INSN(ble, fle, flt);
1045   INSN(blt, flt, fle);
1046 
1047 #undef INSN
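     // For example, float_ble(f0, f1, l, false, true) expands to
     //   flt.s t0, f1, f0
     //   beqz  t0, l
     // which branches when !(f1 < f0), i.e. when f0 <= f1 or when either input is NaN,
     // since flt.s/fle.s produce 0 for an unordered comparison.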
1048 
1049 #define INSN(NAME, CMP)                                                              \
1050   void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l,  \
1051                                     bool is_far, bool is_unordered) {                \
1052     float_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                  \
1053   }                                                                                  \
1054   void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1055                                      bool is_far, bool is_unordered) {               \
1056     double_##CMP(Rs2, Rs1, l, is_far, is_unordered);                                 \
1057   }
1058 
1059   INSN(bgt, blt);
1060   INSN(bge, ble);
1061 
1062 #undef INSN
1063 
1064 
1065 #define INSN(NAME, CSR)                       \
1066   void MacroAssembler::NAME(Register Rd) {    \
1067     csrr(Rd, CSR);                            \
1068   }
1069 
1070   INSN(rdinstret,  CSR_INSTRET);
1071   INSN(rdcycle,    CSR_CYCLE);
1072   INSN(rdtime,     CSR_TIME);
1073   INSN(frcsr,      CSR_FCSR);
1074   INSN(frrm,       CSR_FRM);
1075   INSN(frflags,    CSR_FFLAGS);
1076 
1077 #undef INSN
1078 
1079 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1080   csrrs(Rd, csr, x0);
1081 }
1082 
1083 #define INSN(NAME, OPFUN)                                      \
1084   void MacroAssembler::NAME(unsigned csr, Register Rs) {       \
1085     OPFUN(x0, csr, Rs);                                        \
1086   }
1087 
1088   INSN(csrw, csrrw);
1089   INSN(csrs, csrrs);
1090   INSN(csrc, csrrc);
1091 
1092 #undef INSN
1093 
1094 #define INSN(NAME, OPFUN)                                      \
1095   void MacroAssembler::NAME(unsigned csr, unsigned imm) {      \
1096     OPFUN(x0, csr, imm);                                       \
1097   }
1098 
1099   INSN(csrwi, csrrwi);
1100   INSN(csrsi, csrrsi);
1101   INSN(csrci, csrrci);
1102 
1103 #undef INSN
1104 
1105 #define INSN(NAME, CSR)                                      \
1106   void MacroAssembler::NAME(Register Rd, Register Rs) {      \
1107     csrrw(Rd, CSR, Rs);                                      \
1108   }
1109 
1110   INSN(fscsr,   CSR_FCSR);
1111   INSN(fsrm,    CSR_FRM);
1112   INSN(fsflags, CSR_FFLAGS);
1113 
1114 #undef INSN
1115 
1116 #define INSN(NAME)                              \
1117   void MacroAssembler::NAME(Register Rs) {      \
1118     NAME(x0, Rs);                               \
1119   }
1120 
1121   INSN(fscsr);
1122   INSN(fsrm);
1123   INSN(fsflags);
1124 
1125 #undef INSN
1126 
1127 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1128   guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1129   csrrwi(Rd, CSR_FRM, imm);
1130 }
1131 
1132 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1133    csrrwi(Rd, CSR_FFLAGS, imm);
1134 }
1135 
1136 #define INSN(NAME)                             \
1137   void MacroAssembler::NAME(unsigned imm) {    \
1138     NAME(x0, imm);                             \
1139   }
1140 
1141   INSN(fsrmi);
1142   INSN(fsflagsi);
1143 
1144 #undef INSN
1145 
1146 void MacroAssembler::push_reg(Register Rs)
1147 {
1148   addi(esp, esp, 0 - wordSize);
1149   sd(Rs, Address(esp, 0));
1150 }
1151 
1152 void MacroAssembler::pop_reg(Register Rd)
1153 {
1154   ld(Rd, Address(esp, 0));
1155   addi(esp, esp, wordSize);
1156 }
1157 
1158 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1159   int count = 0;
1160   // Scan bitset to accumulate registers
1161   for (int reg = 31; reg >= 0; reg--) {
1162     if ((1U << 31) & bitset) {
1163       regs[count++] = reg;
1164     }
1165     bitset <<= 1;
1166   }
1167   return count;
1168 }
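     // For example, a bitset with bits 10 and 28 set (x10 and x28) yields count == 2
     // and regs == { 28, 10 }: the scan starts at bit 31 and shifts the set left, so
     // registers are collected in descending numerical order.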
1169 
1170 // Push integer registers in the bitset supplied. Don't push sp.
1171 // Return the number of words pushed
1172 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
1173   DEBUG_ONLY(int words_pushed = 0;)
1174   unsigned char regs[32];
1175   int count = bitset_to_regs(bitset, regs);
1176   // reserve one slot to align for odd count
1177   int offset = is_even(count) ? 0 : wordSize;
1178 
1179   if (count) {
1180     addi(stack, stack, -count * wordSize - offset);
1181   }
1182   for (int i = count - 1; i >= 0; i--) {
1183     sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1184     DEBUG_ONLY(words_pushed++;)
1185   }
1186 
1187   assert(words_pushed == count, "oops, pushed != count");
1188 
1189   return count;
1190 }
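     // For example, push_reg(RegSet::of(x10, x28), sp) adjusts sp by -16 and stores
     // x10 at [sp, 0] and x28 at [sp, 8], so lower-numbered registers end up at lower
     // addresses. For an odd register count the extra slot reserved above sits at the
     // lowest address and keeps the stack pointer 16-byte aligned.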
1191 
1192 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1193   DEBUG_ONLY(int words_popped = 0;)
1194   unsigned char regs[32];
1195   int count = bitset_to_regs(bitset, regs);
1196   // reserve one slot to align for odd count
1197   int offset = is_even(count) ? 0 : wordSize;
1198 
1199   for (int i = count - 1; i >= 0; i--) {
1200     ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1201     DEBUG_ONLY(words_popped++;)
1202   }
1203 
1204   if (count) {
1205     addi(stack, stack, count * wordSize + offset);
1206   }
1207   assert(words_popped == count, "oops, popped != count");
1208 
1209   return count;
1210 }
1211 
1212 // Push floating-point registers in the bitset supplied.
1213 // Return the number of words pushed
1214 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1215   DEBUG_ONLY(int words_pushed = 0;)
1216   unsigned char regs[32];
1217   int count = bitset_to_regs(bitset, regs);
1218   int push_slots = count + (count & 1);
1219 
1220   if (count) {
1221     addi(stack, stack, -push_slots * wordSize);
1222   }
1223 
1224   for (int i = count - 1; i >= 0; i--) {
1225     fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1226     DEBUG_ONLY(words_pushed++;)
1227   }
1228 
1229   assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1230 
1231   return count;
1232 }
1233 
1234 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1235   DEBUG_ONLY(int words_popped = 0;)
1236   unsigned char regs[32];
1237   int count = bitset_to_regs(bitset, regs);
1238   int pop_slots = count + (count & 1);
1239 
1240   for (int i = count - 1; i >= 0; i--) {
1241     fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1242     DEBUG_ONLY(words_popped++;)
1243   }
1244 
1245   if (count) {
1246     addi(stack, stack, pop_slots * wordSize);
1247   }
1248 
1249   assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1250 
1251   return count;
1252 }
1253 
1254 #ifdef COMPILER2
1255 // Push vector registers in the bitset supplied.
1256 // Return the number of words pushed
1257 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
1258   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1259 
1260   // Scan bitset to accumulate registers
1261   unsigned char regs[32];
1262   int count = bitset_to_regs(bitset, regs);
1263 
1264   for (int i = 0; i < count; i++) {
1265     sub(stack, stack, vector_size_in_bytes);
1266     vs1r_v(as_VectorRegister(regs[i]), stack);
1267   }
1268 
1269   return count * vector_size_in_bytes / wordSize;
1270 }
1271 
1272 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
1273   int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
1274 
1275   // Scan bitset to accumulate registers
1276   unsigned char regs[32];
1277   int count = bitset_to_regs(bitset, regs);
1278 
1279   for (int i = count - 1; i >= 0; i--) {
1280     vl1r_v(as_VectorRegister(regs[i]), stack);
1281     add(stack, stack, vector_size_in_bytes);
1282   }
1283 
1284   return count * vector_size_in_bytes / wordSize;
1285 }
1286 #endif // COMPILER2
1287 
1288 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
1289   // Push integer registers x7, x10-x17, x28-x31.
1290   push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1291 
1292   // Push float registers f0-f7, f10-f17, f28-f31.
1293   addi(sp, sp, - wordSize * 20);
1294   int offset = 0;
1295   for (int i = 0; i < 32; i++) {
1296     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1297       fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1298     }
1299   }
1300 }
1301 
1302 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
1303   int offset = 0;
1304   for (int i = 0; i < 32; i++) {
1305     if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
1306       fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
1307     }
1308   }
1309   addi(sp, sp, wordSize * 20);
1310 
1311   pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
1312 }
1313 
1314 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
1315   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1316   push_reg(RegSet::range(x5, x31), sp);
1317 
1318   // float registers
1319   addi(sp, sp, - 32 * wordSize);
1320   for (int i = 0; i < 32; i++) {
1321     fsd(as_FloatRegister(i), Address(sp, i * wordSize));
1322   }
1323 
1324   // vector registers
1325   if (save_vectors) {
1326     sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
1327     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1328     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1329       add(t0, sp, vector_size_in_bytes * i);
1330       vse64_v(as_VectorRegister(i), t0);
1331     }
1332   }
1333 }
1334 
1335 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
1336   // vector registers
1337   if (restore_vectors) {
1338     vsetvli(t0, x0, Assembler::e64, Assembler::m8);
1339     for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
1340       vle64_v(as_VectorRegister(i), sp);
1341       add(sp, sp, vector_size_in_bytes * 8);
1342     }
1343   }
1344 
1345   // float registers
1346   for (int i = 0; i < 32; i++) {
1347     fld(as_FloatRegister(i), Address(sp, i * wordSize));
1348   }
1349   addi(sp, sp, 32 * wordSize);
1350 
1351   // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
1352   pop_reg(RegSet::range(x5, x31), sp);
1353 }
1354 
1355 static int patch_offset_in_jal(address branch, int64_t offset) {
1356   assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
1357          "offset is too large to be patched in one jal instruction!\n");
1358   Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1);                       // offset[20]    ==> branch[31]
1359   Assembler::patch(branch, 30, 21, (offset >> 1)  & 0x3ff);                     // offset[10:1]  ==> branch[30:21]
1360   Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1);                       // offset[11]    ==> branch[20]
1361   Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff);                      // offset[19:12] ==> branch[19:12]
1362   return NativeInstruction::instruction_size;                                   // only one instruction
1363 }
1364 
1365 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
1366   assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
1367          "offset is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n");
1368   Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1);                       // offset[12]    ==> branch[31]
1369   Assembler::patch(branch, 30, 25, (offset >> 5)  & 0x3f);                      // offset[10:5]  ==> branch[30:25]
1370   Assembler::patch(branch, 7,  7,  (offset >> 11) & 0x1);                       // offset[11]    ==> branch[7]
1371   Assembler::patch(branch, 11, 8,  (offset >> 1)  & 0xf);                       // offset[4:1]   ==> branch[11:8]
1372   return NativeInstruction::instruction_size;                                   // only one instruction
1373 }
1374 
1375 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
1376   const int PC_RELATIVE_INSTRUCTION_NUM = 2;                                    // auipc, addi/jalr/load
1377   Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff);         // Auipc.          offset[31:12]  ==> branch[31:12]
1378   Assembler::patch(branch + 4, 31, 20, offset & 0xfff);                         // Addi/Jalr/Load. offset[11:0]   ==> branch[31:20]
1379   return PC_RELATIVE_INSTRUCTION_NUM * NativeInstruction::instruction_size;
1380 }
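     // The "+ 0x800" above mirrors MacroAssembler::la(): when bit 11 of the offset is
     // set it rounds the auipc immediate up by one, compensating for the sign-extended
     // low 12 bits consumed by the following addi/jalr/load.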
1381 
1382 static int patch_addr_in_movptr(address branch, address target) {
1383   const int MOVPTR_INSTRUCTIONS_NUM = 6;                                        // lui + addi + slli + addi + slli + addi/jalr/load
1384   int32_t lower = ((intptr_t)target << 35) >> 35;
1385   int64_t upper = ((intptr_t)target - lower) >> 29;
1386   Assembler::patch(branch + 0,  31, 12, upper & 0xfffff);                       // Lui.             target[48:29] + target[28] ==> branch[31:12]
1387   Assembler::patch(branch + 4,  31, 20, (lower >> 17) & 0xfff);                 // Addi.            target[28:17] ==> branch[31:20]
1388   Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff);                  // Addi.            target[16: 6] ==> branch[31:20]
1389   Assembler::patch(branch + 20, 31, 20, lower & 0x3f);                          // Addi/Jalr/Load.  target[ 5: 0] ==> branch[31:20]
1390   return MOVPTR_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1391 }
1392 
1393 static int patch_imm_in_li64(address branch, address target) {
1394   const int LI64_INSTRUCTIONS_NUM = 8;                                          // lui + addi + slli + addi + slli + addi + slli + addi
1395   int64_t lower = (intptr_t)target & 0xffffffff;
1396   lower = lower - ((lower << 44) >> 44);
1397   int64_t tmp_imm = ((uint64_t)((intptr_t)target & 0xffffffff00000000)) + (uint64_t)lower;
1398   int32_t upper =  (tmp_imm - (int32_t)lower) >> 32;
1399   int64_t tmp_upper = upper, tmp_lower = upper;
1400   tmp_lower = (tmp_lower << 52) >> 52;
1401   tmp_upper -= tmp_lower;
1402   tmp_upper >>= 12;
1403   // Load upper 32 bits. Upper = target[63:32], but if target[31] = 1 or (target[31:20] == 0x7ff && target[19] == 1),
1404   // upper = target[63:32] + 1.
1405   Assembler::patch(branch + 0,  31, 12, tmp_upper & 0xfffff);                       // Lui.
1406   Assembler::patch(branch + 4,  31, 20, tmp_lower & 0xfff);                         // Addi.
1407   // Load the remaining 32 bits.
1408   Assembler::patch(branch + 12, 31, 20, ((int32_t)lower >> 20) & 0xfff);            // Addi.
1409   Assembler::patch(branch + 20, 31, 20, (((intptr_t)target << 44) >> 52) & 0xfff);  // Addi.
1410   Assembler::patch(branch + 28, 31, 20, (intptr_t)target & 0xff);                   // Addi.
1411   return LI64_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1412 }
1413 
1414 static int patch_imm_in_li16u(address branch, uint16_t target) {
1415   Assembler::patch(branch, 31, 12, target); // patch lui only
1416   return NativeInstruction::instruction_size;
1417 }
1418 
1419 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
1420   const int LI32_INSTRUCTIONS_NUM = 2;                                          // lui + addiw
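       // addiw sign-extends its 12-bit immediate, so when bit 11 of the target is
       // set the lui field is bumped by one: e.g. for 0x12345FFF the fields become
       // lui 0x12346 and addiw -1, which add back to 0x12345FFF.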
1421   int64_t upper = (intptr_t)target;
1422   int32_t lower = (((int32_t)target) << 20) >> 20;
1423   upper -= lower;
1424   upper = (int32_t)upper;
1425   Assembler::patch(branch + 0,  31, 12, (upper >> 12) & 0xfffff);               // Lui.
1426   Assembler::patch(branch + 4,  31, 20, lower & 0xfff);                         // Addiw.
1427   return LI32_INSTRUCTIONS_NUM * NativeInstruction::instruction_size;
1428 }
1429 
1430 static long get_offset_of_jal(address insn_addr) {
1431   assert_cond(insn_addr != nullptr);
1432   long offset = 0;
1433   unsigned insn = Assembler::ld_instr(insn_addr);
1434   long val = (long)Assembler::sextract(insn, 31, 12);
1435   offset |= ((val >> 19) & 0x1) << 20;
1436   offset |= (val & 0xff) << 12;
1437   offset |= ((val >> 8) & 0x1) << 11;
1438   offset |= ((val >> 9) & 0x3ff) << 1;
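       // Sign-extend the reassembled 21-bit offset (64 - 43 = 21) to 64 bits.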
1439   offset = (offset << 43) >> 43;
1440   return offset;
1441 }
1442 
1443 static long get_offset_of_conditional_branch(address insn_addr) {
1444   long offset = 0;
1445   assert_cond(insn_addr != nullptr);
1446   unsigned insn = Assembler::ld_instr(insn_addr);
1447   offset = (long)Assembler::sextract(insn, 31, 31);
1448   offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
1449   offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
1450   offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
1451   offset = (offset << 41) >> 41;
1452   return offset;
1453 }
1454 
1455 static long get_offset_of_pc_relative(address insn_addr) {
1456   long offset = 0;
1457   assert_cond(insn_addr != nullptr);
1458   offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12;                               // Auipc.
1459   offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                                  // Addi/Jalr/Load.
1460   offset = (offset << 32) >> 32;
1461   return offset;
1462 }
1463 
1464 static address get_target_of_movptr(address insn_addr) {
1465   assert_cond(insn_addr != nullptr);
1466   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
1467   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17;                 // Addi.
1468   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6;                 // Addi.
1469   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20));                      // Addi/Jalr/Load.
1470   return (address) target_address;
1471 }
1472 
1473 static address get_target_of_li64(address insn_addr) {
1474   assert_cond(insn_addr != nullptr);
1475   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 44; // Lui.
1476   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 32;                 // Addi.
1477   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 20;                // Addi.
1478   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20)) << 8;                 // Addi.
1479   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 28), 31, 20));                      // Addi.
1480   return (address)target_address;
1481 }
1482 
1483 address MacroAssembler::get_target_of_li32(address insn_addr) {
1484   assert_cond(insn_addr != nullptr);
1485   intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
1486   target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20));                       // Addiw.
1487   return (address)target_address;
1488 }
1489 
1490 // Patch any kind of instruction; there may be several instructions.
1491 // Return the total length (in bytes) of the instructions.
1492 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
1493   assert_cond(branch != nullptr);
1494   int64_t offset = target - branch;
1495   if (NativeInstruction::is_jal_at(branch)) {                         // jal
1496     return patch_offset_in_jal(branch, offset);
1497   } else if (NativeInstruction::is_branch_at(branch)) {               // beq/bge/bgeu/blt/bltu/bne
1498     return patch_offset_in_conditional_branch(branch, offset);
1499   } else if (NativeInstruction::is_pc_relative_at(branch)) {          // auipc, addi/jalr/load
1500     return patch_offset_in_pc_relative(branch, offset);
1501   } else if (NativeInstruction::is_movptr_at(branch)) {               // movptr
1502     return patch_addr_in_movptr(branch, target);
1503   } else if (NativeInstruction::is_li64_at(branch)) {                 // li64
1504     return patch_imm_in_li64(branch, target);
1505   } else if (NativeInstruction::is_li32_at(branch)) {                 // li32
1506     int64_t imm = (intptr_t)target;
1507     return patch_imm_in_li32(branch, (int32_t)imm);
1508   } else if (NativeInstruction::is_li16u_at(branch)) {
1509     int64_t imm = (intptr_t)target;
1510     return patch_imm_in_li16u(branch, (uint16_t)imm);
1511   } else {
1512 #ifdef ASSERT
1513     tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
1514                   Assembler::ld_instr(branch), p2i(branch));
1515     Disassembler::decode(branch - 16, branch + 16);
1516 #endif
1517     ShouldNotReachHere();
1518     return -1;
1519   }
1520 }
1521 
1522 address MacroAssembler::target_addr_for_insn(address insn_addr) {
1523   long offset = 0;
1524   assert_cond(insn_addr != nullptr);
1525   if (NativeInstruction::is_jal_at(insn_addr)) {                     // jal
1526     offset = get_offset_of_jal(insn_addr);
1527   } else if (NativeInstruction::is_branch_at(insn_addr)) {           // beq/bge/bgeu/blt/bltu/bne
1528     offset = get_offset_of_conditional_branch(insn_addr);
1529   } else if (NativeInstruction::is_pc_relative_at(insn_addr)) {      // auipc, addi/jalr/load
1530     offset = get_offset_of_pc_relative(insn_addr);
1531   } else if (NativeInstruction::is_movptr_at(insn_addr)) {           // movptr
1532     return get_target_of_movptr(insn_addr);
1533   } else if (NativeInstruction::is_li64_at(insn_addr)) {             // li64
1534     return get_target_of_li64(insn_addr);
1535   } else if (NativeInstruction::is_li32_at(insn_addr)) {             // li32
1536     return get_target_of_li32(insn_addr);
1537   } else {
1538     ShouldNotReachHere();
1539   }
1540   return address(((uintptr_t)insn_addr + offset));
1541 }
1542 
1543 int MacroAssembler::patch_oop(address insn_addr, address o) {
1544   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
1545   // narrow OOPs by setting the upper 16 bits in the first
1546   // instruction.
1547   if (NativeInstruction::is_li32_at(insn_addr)) {
1548     // Move narrow OOP
1549     uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
1550     return patch_imm_in_li32(insn_addr, (int32_t)n);
1551   } else if (NativeInstruction::is_movptr_at(insn_addr)) {
1552     // Move wide OOP
1553     return patch_addr_in_movptr(insn_addr, o);
1554   }
1555   ShouldNotReachHere();
1556   return -1;
1557 }
1558 
1559 void MacroAssembler::reinit_heapbase() {
1560   if (UseCompressedOops) {
1561     if (Universe::is_fully_initialized()) {
1562       mv(xheapbase, CompressedOops::ptrs_base());
1563     } else {
1564       ExternalAddress target(CompressedOops::ptrs_base_addr());
1565       relocate(target.rspec(), [&] {
1566         int32_t offset;
1567         la_patchable(xheapbase, target, offset);
1568         ld(xheapbase, Address(xheapbase, offset));
1569       });
1570     }
1571   }
1572 }
1573 
1574 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset) {
1575   int64_t imm64 = (int64_t)addr;
1576 #ifndef PRODUCT
1577   {
1578     char buffer[64];
1579     snprintf(buffer, sizeof(buffer), "0x%" PRIx64, imm64);
1580     block_comment(buffer);
1581   }
1582 #endif
1583   assert((uintptr_t)imm64 < (1ull << 48), "48-bit overflow in address constant");
1584   // Load upper 31 bits
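       // addi sign-extends its 12-bit immediate, so subtract the sign-extended low
       // 12 bits of 'imm' before forming the lui value; lui(upper) + addi(lower)
       // then reproduces imm exactly.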
1585   int64_t imm = imm64 >> 17;
1586   int64_t upper = imm, lower = imm;
1587   lower = (lower << 52) >> 52;
1588   upper -= lower;
1589   upper = (int32_t)upper;
1590   lui(Rd, upper);
1591   addi(Rd, Rd, lower);
1592 
1593   // Load the remaining 17 bits.
1594   slli(Rd, Rd, 11);
1595   addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
1596   slli(Rd, Rd, 6);
1597 
1598   // This offset will be used by following jalr/ld.
1599   offset = imm64 & 0x3f;
1600 }
1601 
1602 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register temp) {
1603   if (is_simm12(increment)) {
1604     addi(Rd, Rn, increment);
1605   } else {
1606     assert_different_registers(Rn, temp);
1607     li(temp, increment);
1608     add(Rd, Rn, temp);
1609   }
1610 }
1611 
1612 void MacroAssembler::addw(Register Rd, Register Rn, int32_t increment, Register temp) {
1613   if (is_simm12(increment)) {
1614     addiw(Rd, Rn, increment);
1615   } else {
1616     assert_different_registers(Rn, temp);
1617     li(temp, increment);
1618     addw(Rd, Rn, temp);
1619   }
1620 }
1621 
1622 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register temp) {
1623   if (is_simm12(-decrement)) {
1624     addi(Rd, Rn, -decrement);
1625   } else {
1626     assert_different_registers(Rn, temp);
1627     li(temp, decrement);
1628     sub(Rd, Rn, temp);
1629   }
1630 }
1631 
1632 void MacroAssembler::subw(Register Rd, Register Rn, int32_t decrement, Register temp) {
1633   if (is_simm12(-decrement)) {
1634     addiw(Rd, Rn, -decrement);
1635   } else {
1636     assert_different_registers(Rn, temp);
1637     li(temp, decrement);
1638     subw(Rd, Rn, temp);
1639   }
1640 }
1641 
1642 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
1643   andr(Rd, Rs1, Rs2);
1644   sign_extend(Rd, Rd, 32);
1645 }
1646 
1647 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
1648   orr(Rd, Rs1, Rs2);
1649   sign_extend(Rd, Rd, 32);
1650 }
1651 
1652 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
1653   xorr(Rd, Rs1, Rs2);
1654   sign_extend(Rd, Rd, 32);
1655 }
1656 
1657 // Rd = Rs1 & (~Rs2)
1658 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
1659   if (UseZbb) {
1660     Assembler::andn(Rd, Rs1, Rs2);
1661     return;
1662   }
1663 
1664   notr(Rd, Rs2);
1665   andr(Rd, Rs1, Rd);
1666 }
1667 
1668 // Rd = Rs1 | (~Rs2)
1669 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
1670   if (UseZbb) {
1671     Assembler::orn(Rd, Rs1, Rs2);
1672     return;
1673   }
1674 
1675   notr(Rd, Rs2);
1676   orr(Rd, Rs1, Rd);
1677 }
1678 
1679 // Note: load_unsigned_short used to be called load_unsigned_word.
1680 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1681   int off = offset();
1682   lhu(dst, src);
1683   return off;
1684 }
1685 
1686 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1687   int off = offset();
1688   lbu(dst, src);
1689   return off;
1690 }
1691 
1692 int MacroAssembler::load_signed_short(Register dst, Address src) {
1693   int off = offset();
1694   lh(dst, src);
1695   return off;
1696 }
1697 
1698 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1699   int off = offset();
1700   lb(dst, src);
1701   return off;
1702 }
1703 
1704 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
1705   switch (size_in_bytes) {
1706     case  8:  ld(dst, src); break;
1707     case  4:  is_signed ? lw(dst, src) : lwu(dst, src); break;
1708     case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1709     case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1710     default:  ShouldNotReachHere();
1711   }
1712 }
1713 
1714 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
1715   switch (size_in_bytes) {
1716     case  8:  sd(src, dst); break;
1717     case  4:  sw(src, dst); break;
1718     case  2:  sh(src, dst); break;
1719     case  1:  sb(src, dst); break;
1720     default:  ShouldNotReachHere();
1721   }
1722 }
1723 
1724 // granularity is 1 or 2 bytes per load; dst and src.base() are allowed to be the same register
1725 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
1726   if (granularity != 1 && granularity != 2) {
1727     ShouldNotReachHere();
1728   }
1729   if (AvoidUnalignedAccesses && (granularity != 2)) {
1730     assert_different_registers(dst, tmp);
1731     assert_different_registers(tmp, src.base());
1732     is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
1733     slli(tmp, tmp, 8);
1734     lbu(dst, src);
1735     add(dst, dst, tmp);
1736   } else {
1737     is_signed ? lh(dst, src) : lhu(dst, src);
1738   }
1739 }
1740 
1741 // granularity is 1, 2 or 4 bytes per load; if granularity is 2 or 4 then dst and src.base() are allowed to be the same register
1742 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
1743   if (AvoidUnalignedAccesses && (granularity != 4)) {
1744     switch(granularity) {
1745       case 1:
1746         assert_different_registers(dst, tmp, src.base());
1747         lbu(dst, src);
1748         lbu(tmp, Address(src.base(), src.offset() + 1));
1749         slli(tmp, tmp, 8);
1750         add(dst, dst, tmp);
1751         lbu(tmp, Address(src.base(), src.offset() + 2));
1752         slli(tmp, tmp, 16);
1753         add(dst, dst, tmp);
1754         is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
1755         slli(tmp, tmp, 24);
1756         add(dst, dst, tmp);
1757         break;
1758       case 2:
1759         assert_different_registers(dst, tmp);
1760         assert_different_registers(tmp, src.base());
1761         is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
1762         slli(tmp, tmp, 16);
1763         lhu(dst, src);
1764         add(dst, dst, tmp);
1765         break;
1766       default:
1767         ShouldNotReachHere();
1768     }
1769   } else {
1770     is_signed ? lw(dst, src) : lwu(dst, src);
1771   }
1772 }
1773 
1774 // granularity is 1, 2, 4 or 8 bytes per load; if granularity is 4 or 8 then dst and src.base() are allowed to be the same register
1775 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
1776   if (AvoidUnalignedAccesses && (granularity != 8)) {
1777     switch(granularity){
1778       case 1:
1779         assert_different_registers(dst, tmp, src.base());
1780         lbu(dst, src);
1781         lbu(tmp, Address(src.base(), src.offset() + 1));
1782         slli(tmp, tmp, 8);
1783         add(dst, dst, tmp);
1784         lbu(tmp, Address(src.base(), src.offset() + 2));
1785         slli(tmp, tmp, 16);
1786         add(dst, dst, tmp);
1787         lbu(tmp, Address(src.base(), src.offset() + 3));
1788         slli(tmp, tmp, 24);
1789         add(dst, dst, tmp);
1790         lbu(tmp, Address(src.base(), src.offset() + 4));
1791         slli(tmp, tmp, 32);
1792         add(dst, dst, tmp);
1793         lbu(tmp, Address(src.base(), src.offset() + 5));
1794         slli(tmp, tmp, 40);
1795         add(dst, dst, tmp);
1796         lbu(tmp, Address(src.base(), src.offset() + 6));
1797         slli(tmp, tmp, 48);
1798         add(dst, dst, tmp);
1799         lbu(tmp, Address(src.base(), src.offset() + 7));
1800         slli(tmp, tmp, 56);
1801         add(dst, dst, tmp);
1802         break;
1803       case 2:
1804         assert_different_registers(dst, tmp, src.base());
1805         lhu(dst, src);
1806         lhu(tmp, Address(src.base(), src.offset() + 2));
1807         slli(tmp, tmp, 16);
1808         add(dst, dst, tmp);
1809         lhu(tmp, Address(src.base(), src.offset() + 4));
1810         slli(tmp, tmp, 32);
1811         add(dst, dst, tmp);
1812         lhu(tmp, Address(src.base(), src.offset() + 6));
1813         slli(tmp, tmp, 48);
1814         add(dst, dst, tmp);
1815         break;
1816       case 4:
1817         assert_different_registers(dst, tmp);
1818         assert_different_registers(tmp, src.base());
1819         lwu(tmp, Address(src.base(), src.offset() + 4));
1820         slli(tmp, tmp, 32);
1821         lwu(dst, src);
1822         add(dst, dst, tmp);
1823         break;
1824       default:
1825         ShouldNotReachHere();
1826     }
1827   } else {
1828     ld(dst, src);
1829   }
1830 }
1831 
1832 
1833 // reverse bytes in halfword in lower 16 bits and sign-extend
1834 // Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
1835 void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
1836   if (UseZbb) {
1837     rev8(Rd, Rs);
1838     srai(Rd, Rd, 48);
1839     return;
1840   }
1841   assert_different_registers(Rs, tmp);
1842   assert_different_registers(Rd, tmp);
1843   srli(tmp, Rs, 8);
1844   andi(tmp, tmp, 0xFF);
1845   slli(Rd, Rs, 56);
1846   srai(Rd, Rd, 48); // sign-extend
1847   orr(Rd, Rd, tmp);
1848 }
1849 
1850 // reverse bytes in lower word and sign-extend
1851 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
1852 void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1853   if (UseZbb) {
1854     rev8(Rd, Rs);
1855     srai(Rd, Rd, 32);
1856     return;
1857   }
1858   assert_different_registers(Rs, tmp1, tmp2);
1859   assert_different_registers(Rd, tmp1, tmp2);
1860   revb_h_w_u(Rd, Rs, tmp1, tmp2);
1861   slli(tmp2, Rd, 48);
1862   srai(tmp2, tmp2, 32); // sign-extend
1863   srli(Rd, Rd, 16);
1864   orr(Rd, Rd, tmp2);
1865 }
1866 
1867 // reverse bytes in halfword in lower 16 bits and zero-extend
1868 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1869 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
1870   if (UseZbb) {
1871     rev8(Rd, Rs);
1872     srli(Rd, Rd, 48);
1873     return;
1874   }
1875   assert_different_registers(Rs, tmp);
1876   assert_different_registers(Rd, tmp);
1877   srli(tmp, Rs, 8);
1878   andi(tmp, tmp, 0xFF);
1879   andi(Rd, Rs, 0xFF);
1880   slli(Rd, Rd, 8);
1881   orr(Rd, Rd, tmp);
1882 }
1883 
1884 // reverse bytes in halfwords in lower 32 bits and zero-extend
1885 // Rd[31:0] = Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
1886 void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1887   if (UseZbb) {
1888     rev8(Rd, Rs);
1889     rori(Rd, Rd, 32);
1890     roriw(Rd, Rd, 16);
1891     zero_extend(Rd, Rd, 32);
1892     return;
1893   }
1894   assert_different_registers(Rs, tmp1, tmp2);
1895   assert_different_registers(Rd, tmp1, tmp2);
1896   srli(tmp2, Rs, 16);
1897   revb_h_h_u(tmp2, tmp2, tmp1);
1898   revb_h_h_u(Rd, Rs, tmp1);
1899   slli(tmp2, tmp2, 16);
1900   orr(Rd, Rd, tmp2);
1901 }
1902 
1903 // This method is only used for revb_h
1904 // Rd = Rs[47:0] Rs[55:48] Rs[63:56]
1905 void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1906   assert_different_registers(Rs, tmp1, tmp2);
1907   assert_different_registers(Rd, tmp1);
1908   srli(tmp1, Rs, 48);
1909   andi(tmp2, tmp1, 0xFF);
1910   slli(tmp2, tmp2, 8);
1911   srli(tmp1, tmp1, 8);
1912   orr(tmp1, tmp1, tmp2);
1913   slli(Rd, Rs, 16);
1914   orr(Rd, Rd, tmp1);
1915 }
1916 
1917 // reverse bytes in each halfword
1918 // Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
1919 void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1920   if (UseZbb) {
1921     assert_different_registers(Rs, tmp1);
1922     assert_different_registers(Rd, tmp1);
1923     rev8(Rd, Rs);
1924     zero_extend(tmp1, Rd, 32);
1925     roriw(tmp1, tmp1, 16);
1926     slli(tmp1, tmp1, 32);
1927     srli(Rd, Rd, 32);
1928     roriw(Rd, Rd, 16);
1929     zero_extend(Rd, Rd, 32);
1930     orr(Rd, Rd, tmp1);
1931     return;
1932   }
1933   assert_different_registers(Rs, tmp1, tmp2);
1934   assert_different_registers(Rd, tmp1, tmp2);
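       // Four helper passes: each one byte-swaps the top halfword and rotates the
       // value left by 16, so after four passes every halfword is byte-swapped and
       // back in its original position.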
1935   revb_h_helper(Rd, Rs, tmp1, tmp2);
1936   for (int i = 0; i < 3; ++i) {
1937     revb_h_helper(Rd, Rd, tmp1, tmp2);
1938   }
1939 }
1940 
1941 // reverse bytes in each word
1942 // Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
1943 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1944   if (UseZbb) {
1945     rev8(Rd, Rs);
1946     rori(Rd, Rd, 32);
1947     return;
1948   }
1949   assert_different_registers(Rs, tmp1, tmp2);
1950   assert_different_registers(Rd, tmp1, tmp2);
1951   revb(Rd, Rs, tmp1, tmp2);
1952   ror_imm(Rd, Rd, 32);
1953 }
1954 
1955 // reverse bytes in doubleword
1956 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
1957 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
1958   if (UseZbb) {
1959     rev8(Rd, Rs);
1960     return;
1961   }
1962   assert_different_registers(Rs, tmp1, tmp2);
1963   assert_different_registers(Rd, tmp1, tmp2);
1964   andi(tmp1, Rs, 0xFF);
1965   slli(tmp1, tmp1, 8);
1966   for (int step = 8; step < 56; step += 8) {
1967     srli(tmp2, Rs, step);
1968     andi(tmp2, tmp2, 0xFF);
1969     orr(tmp1, tmp1, tmp2);
1970     slli(tmp1, tmp1, 8);
1971   }
1972   srli(Rd, Rs, 56);
1973   andi(Rd, Rd, 0xFF);
1974   orr(Rd, tmp1, Rd);
1975 }
1976 
1977 // rotate right with shift bits
1978 void MacroAssembler::ror_imm(Register dst, Register src, uint32_t shift, Register tmp)
1979 {
1980   if (UseZbb) {
1981     rori(dst, src, shift);
1982     return;
1983   }
1984 
1985   assert_different_registers(dst, tmp);
1986   assert_different_registers(src, tmp);
1987   assert(shift < 64, "shift amount must be < 64");
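       // Compose the rotate from two shifts: bits shifted out at the bottom by
       // srli reappear at the top via slli(64 - shift), and the halves are ORed.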
1988   slli(tmp, src, 64 - shift);
1989   srli(dst, src, shift);
1990   orr(dst, dst, tmp);
1991 }
1992 
1993 // rotate left with shift bits, 32-bit version
1994 void MacroAssembler::rolw_imm(Register dst, Register src, uint32_t shift, Register tmp) {
1995   if (UseZbb) {
1996     // no roliw available
1997     roriw(dst, src, 32 - shift);
1998     return;
1999   }
2000 
2001   assert_different_registers(dst, tmp);
2002   assert_different_registers(src, tmp);
2003   assert(shift < 32, "shift amount must be < 32");
2004   srliw(tmp, src, 32 - shift);
2005   slliw(dst, src, shift);
2006   orr(dst, dst, tmp);
2007 }
2008 
2009 void MacroAssembler::andi(Register Rd, Register Rn, int64_t imm, Register tmp) {
2010   if (is_simm12(imm)) {
2011     and_imm12(Rd, Rn, imm);
2012   } else {
2013     assert_different_registers(Rn, tmp);
2014     mv(tmp, imm);
2015     andr(Rd, Rn, tmp);
2016   }
2017 }
2018 
2019 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
2020   ld(tmp1, adr);
2021   if (src.is_register()) {
2022     orr(tmp1, tmp1, src.as_register());
2023   } else {
2024     if (is_simm12(src.as_constant())) {
2025       ori(tmp1, tmp1, src.as_constant());
2026     } else {
2027       assert_different_registers(tmp1, tmp2);
2028       mv(tmp2, src.as_constant());
2029       orr(tmp1, tmp1, tmp2);
2030     }
2031   }
2032   sd(tmp1, adr);
2033 }
2034 
2035 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp1, Register tmp2, Label &L) {
2036   assert_different_registers(oop, trial_klass, tmp1, tmp2);
2037   if (UseCompressedClassPointers) {
2038     lwu(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2039     if (CompressedKlassPointers::base() == nullptr) {
2040       slli(tmp1, tmp1, CompressedKlassPointers::shift());
2041       beq(trial_klass, tmp1, L);
2042       return;
2043     }
2044     decode_klass_not_null(tmp1, tmp2);
2045   } else {
2046     ld(tmp1, Address(oop, oopDesc::klass_offset_in_bytes()));
2047   }
2048   beq(trial_klass, tmp1, L);
2049 }
2050 
2051 // Move an oop into a register.
2052 void MacroAssembler::movoop(Register dst, jobject obj) {
2053   int oop_index;
2054   if (obj == nullptr) {
2055     oop_index = oop_recorder()->allocate_oop_index(obj);
2056   } else {
2057 #ifdef ASSERT
2058     {
2059       ThreadInVMfromUnknown tiv;
2060       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
2061     }
2062 #endif
2063     oop_index = oop_recorder()->find_index(obj);
2064   }
2065   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2066 
2067   if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
2068     mv(dst, Address((address)obj, rspec));
2069   } else {
2070     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
2071     ld_constant(dst, Address(dummy, rspec));
2072   }
2073 }
2074 
2075 // Move a metadata address into a register.
2076 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2077   int oop_index;
2078   if (obj == nullptr) {
2079     oop_index = oop_recorder()->allocate_metadata_index(obj);
2080   } else {
2081     oop_index = oop_recorder()->find_index(obj);
2082   }
2083   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2084   mv(dst, Address((address)obj, rspec));
2085 }
2086 
2087 // Writes to stack successive pages until offset reached to check for
2088 // stack overflow + shadow pages.  This clobbers tmp.
2089 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2090   assert_different_registers(tmp, size, t0);
2091   // Bang stack for total size given plus shadow page size.
2092   // Bang one page at a time because large size can bang beyond yellow and
2093   // red zones.
2094   mv(t0, (int)os::vm_page_size());
2095   Label loop;
2096   bind(loop);
2097   sub(tmp, sp, t0);
2098   subw(size, size, t0);
2099   sd(size, Address(tmp));
2100   bgtz(size, loop);
2101 
2102   // Bang down shadow pages too.
2103   // At this point, (tmp-0) is the last address touched, so don't
2104   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2105   // was post-decremented.)  Skip this address by starting at i=1, and
2106   // touch a few more pages below.  N.B.  It is important to touch all
2107   // the way down to and including i=StackShadowPages.
2108   for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
2109     // this could be any sized move, but this can be a debugging crumb
2110     // so the bigger the better.
2111     sub(tmp, tmp, (int)os::vm_page_size());
2112     sd(size, Address(tmp, 0));
2113   }
2114 }
2115 
2116 SkipIfEqual::SkipIfEqual(MacroAssembler* masm, const bool* flag_addr, bool value) {
2117   int32_t offset = 0;
2118   _masm = masm;
2119   ExternalAddress target((address)flag_addr);
2120   _masm->relocate(target.rspec(), [&] {
2121     int32_t offset;
2122     _masm->la_patchable(t0, target, offset);
2123     _masm->lbu(t0, Address(t0, offset));
2124   });
2125   if (value) {
2126     _masm->bnez(t0, _label);
2127   } else {
2128     _masm->beqz(t0, _label);
2129   }
2130 }
2131 
2132 SkipIfEqual::~SkipIfEqual() {
2133   _masm->bind(_label);
2134   _masm = nullptr;
2135 }
2136 
2137 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
2138   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
2139   ld(dst, Address(xmethod, Method::const_offset()));
2140   ld(dst, Address(dst, ConstMethod::constants_offset()));
2141   ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
2142   ld(dst, Address(dst, mirror_offset));
2143   resolve_oop_handle(dst, tmp1, tmp2);
2144 }
2145 
2146 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
2147   // OopHandle::resolve is an indirection.
2148   assert_different_registers(result, tmp1, tmp2);
2149   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
2150 }
2151 
2152 // ((WeakHandle)result).resolve()
2153 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
2154   assert_different_registers(result, tmp1, tmp2);
2155   Label resolved;
2156 
2157   // A null weak handle resolves to null.
2158   beqz(result, resolved);
2159 
2160   // Only 64 bit platforms support GCs that require a tmp register
2161   // Only IN_HEAP loads require a thread_tmp register
2162   // WeakHandle::resolve is an indirection like jweak.
2163   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
2164                  result, Address(result), tmp1, tmp2);
2165   bind(resolved);
2166 }
2167 
2168 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
2169                                     Register dst, Address src,
2170                                     Register tmp1, Register tmp2) {
2171   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2172   decorators = AccessInternal::decorator_fixup(decorators, type);
2173   bool as_raw = (decorators & AS_RAW) != 0;
2174   if (as_raw) {
2175     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
2176   } else {
2177     bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
2178   }
2179 }
2180 
2181 void MacroAssembler::null_check(Register reg, int offset) {
2182   if (needs_explicit_null_check(offset)) {
2183     // provoke OS null exception if reg is null by
2184     // accessing M[reg] w/o changing any registers
2185     // NOTE: this is plenty to provoke a segv
2186     ld(zr, Address(reg, 0));
2187   } else {
2188     // nothing to do, (later) access of M[reg + offset]
2189     // will provoke OS null exception if reg is null
2190   }
2191 }
2192 
2193 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
2194                                      Address dst, Register val,
2195                                      Register tmp1, Register tmp2, Register tmp3) {
2196   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2197   decorators = AccessInternal::decorator_fixup(decorators, type);
2198   bool as_raw = (decorators & AS_RAW) != 0;
2199   if (as_raw) {
2200     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2201   } else {
2202     bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
2203   }
2204 }
2205 
2206 // Algorithm must match CompressedOops::encode.
2207 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2208   verify_oop_msg(s, "broken oop in encode_heap_oop");
2209   if (CompressedOops::base() == nullptr) {
2210     if (CompressedOops::shift() != 0) {
2211       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2212       srli(d, s, LogMinObjAlignmentInBytes);
2213     } else {
2214       mv(d, s);
2215     }
2216   } else {
2217     Label notNull;
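         // For a null oop, s - xheapbase is negative; clamp it to zero below so
         // that null always encodes to 0.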
2218     sub(d, s, xheapbase);
2219     bgez(d, notNull);
2220     mv(d, zr);
2221     bind(notNull);
2222     if (CompressedOops::shift() != 0) {
2223       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2224       srli(d, d, CompressedOops::shift());
2225     }
2226   }
2227 }
2228 
2229 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
2230   assert_different_registers(dst, tmp);
2231   assert_different_registers(src, tmp);
2232   if (UseCompressedClassPointers) {
2233     lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2234     decode_klass_not_null(dst, tmp);
2235   } else {
2236     ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2237   }
2238 }
2239 
2240 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
2241   // FIXME: Should this be a store release? Concurrent GCs assume the
2242   // klass length is valid if the klass field is not null.
2243   if (UseCompressedClassPointers) {
2244     encode_klass_not_null(src, tmp);
2245     sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2246   } else {
2247     sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2248   }
2249 }
2250 
2251 void MacroAssembler::store_klass_gap(Register dst, Register src) {
2252   if (UseCompressedClassPointers) {
2253     // Store to klass gap in destination
2254     sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2255   }
2256 }
2257 
2258 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
2259   assert_different_registers(r, tmp);
2260   decode_klass_not_null(r, r, tmp);
2261 }
2262 
2263 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
2264   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2265 
2266   if (CompressedKlassPointers::base() == nullptr) {
2267     if (CompressedKlassPointers::shift() != 0) {
2268       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2269       slli(dst, src, LogKlassAlignmentInBytes);
2270     } else {
2271       mv(dst, src);
2272     }
2273     return;
2274   }
2275 
2276   Register xbase = dst;
2277   if (dst == src) {
2278     xbase = tmp;
2279   }
2280 
2281   assert_different_registers(src, xbase);
2282   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2283 
2284   if (CompressedKlassPointers::shift() != 0) {
2285     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2286     assert_different_registers(t0, xbase);
2287     shadd(dst, src, xbase, t0, LogKlassAlignmentInBytes);
2288   } else {
2289     add(dst, xbase, src);
2290   }
2291 }
2292 
2293 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
2294   assert_different_registers(r, tmp);
2295   encode_klass_not_null(r, r, tmp);
2296 }
2297 
2298 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
2299   assert(UseCompressedClassPointers, "should only be used for compressed headers");
2300 
2301   if (CompressedKlassPointers::base() == nullptr) {
2302     if (CompressedKlassPointers::shift() != 0) {
2303       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2304       srli(dst, src, LogKlassAlignmentInBytes);
2305     } else {
2306       mv(dst, src);
2307     }
2308     return;
2309   }
2310 
2311   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
2312       CompressedKlassPointers::shift() == 0) {
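         // With such a base (low 32 bits all zero) and no shift, the narrow klass
         // is just the low 32 bits of the pointer, assuming the encodable range
         // lies within 4G above the base, so a zero-extend suffices.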
2313     zero_extend(dst, src, 32);
2314     return;
2315   }
2316 
2317   Register xbase = dst;
2318   if (dst == src) {
2319     xbase = tmp;
2320   }
2321 
2322   assert_different_registers(src, xbase);
2323   mv(xbase, (uintptr_t)CompressedKlassPointers::base());
2324   sub(dst, src, xbase);
2325   if (CompressedKlassPointers::shift() != 0) {
2326     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
2327     srli(dst, dst, LogKlassAlignmentInBytes);
2328   }
2329 }
2330 
2331 void MacroAssembler::decode_heap_oop_not_null(Register r) {
2332   decode_heap_oop_not_null(r, r);
2333 }
2334 
2335 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
2336   assert(UseCompressedOops, "should only be used for compressed headers");
2337   assert(Universe::heap() != nullptr, "java heap should be initialized");
2338   // Cannot assert, unverified entry point counts instructions (see .ad file)
2339   // vtableStubs also counts instructions in pd_code_size_limit.
2340   // Also do not verify_oop as this is called by verify_oop.
2341   if (CompressedOops::shift() != 0) {
2342     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
2343     slli(dst, src, LogMinObjAlignmentInBytes);
2344     if (CompressedOops::base() != nullptr) {
2345       add(dst, xheapbase, dst);
2346     }
2347   } else {
2348     assert(CompressedOops::base() == nullptr, "sanity");
2349     mv(dst, src);
2350   }
2351 }
2352 
2353 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
2354   if (CompressedOops::base() == nullptr) {
2355     if (CompressedOops::shift() != 0 || d != s) {
2356       slli(d, s, CompressedOops::shift());
2357     }
2358   } else {
2359     Label done;
2360     mv(d, s);
2361     beqz(s, done);
2362     shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
2363     bind(done);
2364   }
2365   verify_oop_msg(d, "broken oop in decode_heap_oop");
2366 }
2367 
2368 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
2369                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
2370   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
2371 }
2372 
2373 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
2374                                    Register tmp2, DecoratorSet decorators) {
2375   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
2376 }
2377 
2378 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
2379                                             Register tmp2, DecoratorSet decorators) {
2380   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, tmp2);
2381 }
2382 
2383 // Used for storing nulls.
2384 void MacroAssembler::store_heap_oop_null(Address dst) {
2385   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
2386 }
2387 
2388 int MacroAssembler::corrected_idivl(Register result, Register rs1, Register rs2,
2389                                     bool want_remainder)
2390 {
2391   // Full implementation of Java idiv and irem.  The function
2392   // returns the (pc) offset of the div instruction - may be needed
2393   // for implicit exceptions.
2394   //
2395   // input : rs1: dividend
2396   //         rs2: divisor
2397   //
2398   // result: either
2399   //         quotient  (= rs1 idiv rs2)
2400   //         remainder (= rs1 irem rs2)
2401 
2402 
2403   int idivl_offset = offset();
2404   if (!want_remainder) {
2405     divw(result, rs1, rs2);
2406   } else {
2407     remw(result, rs1, rs2); // result = rs1 % rs2;
2408   }
2409   return idivl_offset;
2410 }
2411 
2412 int MacroAssembler::corrected_idivq(Register result, Register rs1, Register rs2,
2413                                     bool want_remainder)
2414 {
2415   // Full implementation of Java ldiv and lrem.  The function
2416   // returns the (pc) offset of the div instruction - may be needed
2417   // for implicit exceptions.
2418   //
2419   // input : rs1: dividend
2420   //         rs2: divisor
2421   //
2422   // result: either
2423   //         quotient  (= rs1 idiv rs2)
2424   //         remainder (= rs1 irem rs2)
2425 
2426   int idivq_offset = offset();
2427   if (!want_remainder) {
2428     div(result, rs1, rs2);
2429   } else {
2430     rem(result, rs1, rs2); // result = rs1 % rs2;
2431   }
2432   return idivq_offset;
2433 }
2434 
2435 // Look up the method for a megamorphic invokeinterface call.
2436 // The target method is determined by <intf_klass, itable_index>.
2437 // The receiver klass is in recv_klass.
2438 // On success, the result will be in method_result, and execution falls through.
2439 // On failure, execution transfers to the given label.
2440 void MacroAssembler::lookup_interface_method(Register recv_klass,
2441                                              Register intf_klass,
2442                                              RegisterOrConstant itable_index,
2443                                              Register method_result,
2444                                              Register scan_tmp,
2445                                              Label& L_no_such_interface,
2446                                              bool return_method) {
2447   assert_different_registers(recv_klass, intf_klass, scan_tmp);
2448   assert_different_registers(method_result, intf_klass, scan_tmp);
2449   assert(recv_klass != method_result || !return_method,
2450          "recv_klass can be destroyed when the method isn't needed");
2451   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
2452          "caller must be same register for non-constant itable index as for method");
2453 
2454   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
2455   int vtable_base = in_bytes(Klass::vtable_start_offset());
2456   int itentry_off = in_bytes(itableMethodEntry::method_offset());
2457   int scan_step   = itableOffsetEntry::size() * wordSize;
2458   int vte_size    = vtableEntry::size_in_bytes();
2459   assert(vte_size == wordSize, "else adjust times_vte_scale");
2460 
2461   lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
2462 
2463   // %%% Could store the aligned, prescaled offset in the klassoop.
2464   shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
2465   add(scan_tmp, scan_tmp, vtable_base);
2466 
2467   if (return_method) {
2468     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
2469     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
2470     if (itable_index.is_register()) {
2471       slli(t0, itable_index.as_register(), 3);
2472     } else {
2473       mv(t0, itable_index.as_constant() << 3);
2474     }
2475     add(recv_klass, recv_klass, t0);
2476     if (itentry_off) {
2477       add(recv_klass, recv_klass, itentry_off);
2478     }
2479   }
2480 
2481   Label search, found_method;
2482 
2483   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2484   beq(intf_klass, method_result, found_method);
2485   bind(search);
2486   // Check that the previous entry is non-null. A null entry means that
2487   // the receiver class doesn't implement the interface, and wasn't the
2488   // same as when the caller was compiled.
2489   beqz(method_result, L_no_such_interface, /* is_far */ true);
2490   addi(scan_tmp, scan_tmp, scan_step);
2491   ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
2492   bne(intf_klass, method_result, search);
2493 
2494   bind(found_method);
2495 
2496   // Got a hit.
2497   if (return_method) {
2498     lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
2499     add(method_result, recv_klass, scan_tmp);
2500     ld(method_result, Address(method_result));
2501   }
2502 }
2503 
2504 // virtual method calling
2505 void MacroAssembler::lookup_virtual_method(Register recv_klass,
2506                                            RegisterOrConstant vtable_index,
2507                                            Register method_result) {
2508   const ByteSize base = Klass::vtable_start_offset();
2509   assert(vtableEntry::size() * wordSize == 8,
2510          "adjust the scaling in the code below");
2511   int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
2512 
2513   if (vtable_index.is_register()) {
2514     shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
2515     ld(method_result, Address(method_result, vtable_offset_in_bytes));
2516   } else {
2517     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
2518     ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
2519   }
2520 }
2521 
2522 void MacroAssembler::membar(uint32_t order_constraint) {
2523   address prev = pc() - NativeMembar::instruction_size;
2524   address last = code()->last_insn();
2525 
2526   if (last != nullptr && nativeInstruction_at(last)->is_membar() && prev == last) {
2527     NativeMembar *bar = NativeMembar_at(prev);
2528     // We are merging two memory barrier instructions.  On RISCV we
2529     // can do this simply by ORing them together.
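         // A fence whose predecessor/successor sets are the union of the two
         // original sets orders at least everything either original fence ordered.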
2530     bar->set_kind(bar->get_kind() | order_constraint);
2531     BLOCK_COMMENT("merged membar");
2532   } else {
2533     code()->set_last_insn(pc());
2534 
2535     uint32_t predecessor = 0;
2536     uint32_t successor = 0;
2537 
2538     membar_mask_to_pred_succ(order_constraint, predecessor, successor);
2539     fence(predecessor, successor);
2540   }
2541 }
2542 
2543 // Form an address from base + offset in Rd. Rd may or may not
2544 // actually be used: you must use the Address that is returned. It
2545 // is up to you to ensure that the shift provided matches the size
2546 // of your data.
2547 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
2548   if (is_simm12(byte_offset)) { // 12: imm in range 2^12
2549     return Address(base, byte_offset);
2550   }
2551 
2552   assert_different_registers(Rd, base, noreg);
2553 
2554   // Do it the hard way
2555   mv(Rd, byte_offset);
2556   add(Rd, base, Rd);
2557   return Address(Rd);
2558 }
2559 
2560 void MacroAssembler::check_klass_subtype(Register sub_klass,
2561                                          Register super_klass,
2562                                          Register tmp_reg,
2563                                          Label& L_success) {
2564   Label L_failure;
2565   check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
2566   check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
2567   bind(L_failure);
2568 }
2569 
2570 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
2571   ld(t0, Address(xthread, JavaThread::polling_word_offset()));
2572   if (acquire) {
2573     membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
2574   }
2575   if (at_return) {
2576     bgtu(in_nmethod ? sp : fp, t0, slow_path, /* is_far */ true);
2577   } else {
2578     test_bit(t0, t0, exact_log2(SafepointMechanism::poll_bit()));
2579     bnez(t0, slow_path, true /* is_far */);
2580   }
2581 }
2582 
2583 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2584                                 Label &succeed, Label *fail) {
2585   assert_different_registers(addr, tmp);
2586   assert_different_registers(newv, tmp);
2587   assert_different_registers(oldv, tmp);
2588 
2589   // oldv holds comparison value
2590   // newv holds value to write in exchange
2591   // addr identifies memory word to compare against/update
2592   Label retry_load, nope;
2593   bind(retry_load);
2594   // Load reserved from the memory location
2595   lr_d(tmp, addr, Assembler::aqrl);
2596   // Fail and exit if it is not what we expect
2597   bne(tmp, oldv, nope);
2598   // If the store conditional succeeds, tmp will be zero
2599   sc_d(tmp, newv, addr, Assembler::rl);
2600   beqz(tmp, succeed);
2601   // Retry only when the store conditional failed
2602   j(retry_load);
2603 
2604   bind(nope);
2605   membar(AnyAny);
2606   mv(oldv, tmp);
2607   if (fail != nullptr) {
2608     j(*fail);
2609   }
2610 }
2611 
2612 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2613                                         Label &succeed, Label *fail) {
2614   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2615   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2616 }
2617 
2618 void MacroAssembler::load_reserved(Register addr,
2619                                    enum operand_size size,
2620                                    Assembler::Aqrl acquire) {
2621   switch (size) {
2622     case int64:
2623       lr_d(t0, addr, acquire);
2624       break;
2625     case int32:
2626       lr_w(t0, addr, acquire);
2627       break;
2628     case uint32:
2629       lr_w(t0, addr, acquire);
2630       zero_extend(t0, t0, 32);
2631       break;
2632     default:
2633       ShouldNotReachHere();
2634   }
2635 }
2636 
2637 void MacroAssembler::store_conditional(Register addr,
2638                                        Register new_val,
2639                                        enum operand_size size,
2640                                        Assembler::Aqrl release) {
2641   switch (size) {
2642     case int64:
2643       sc_d(t0, new_val, addr, release);
2644       break;
2645     case int32:
2646     case uint32:
2647       sc_w(t0, new_val, addr, release);
2648       break;
2649     default:
2650       ShouldNotReachHere();
2651   }
2652 }
2653 
2654 
2655 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected,
2656                                                  Register new_val,
2657                                                  enum operand_size size,
2658                                                  Register tmp1, Register tmp2, Register tmp3) {
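       // Split 'addr' into a 4-byte-aligned address plus a bit shift, build a mask
       // covering the byte/halfword lane at that shift, and pre-shift 'expected'
       // and 'new_val' into that lane so the caller's lr.w/sc.w loop can operate
       // on the containing aligned word.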
2659   assert(size == int8 || size == int16, "unsupported operand size");
2660 
2661   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3;
2662 
2663   andi(shift, addr, 3);
2664   slli(shift, shift, 3);
2665 
2666   andi(aligned_addr, addr, ~3);
2667 
2668   if (size == int8) {
2669     mv(mask, 0xff);
2670   } else {
2671     // size == int16 case
2672     mv(mask, -1);
2673     zero_extend(mask, mask, 16);
2674   }
2675   sll(mask, mask, shift);
2676 
2677   xori(not_mask, mask, -1);
2678 
2679   sll(expected, expected, shift);
2680   andr(expected, expected, mask);
2681 
2682   sll(new_val, new_val, shift);
2683   andr(new_val, new_val, mask);
2684 }
2685 
2686 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
2687 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w,
2688 // which are forced to work with 4-byte aligned address.
2689 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
2690                                           Register new_val,
2691                                           enum operand_size size,
2692                                           Assembler::Aqrl acquire, Assembler::Aqrl release,
2693                                           Register result, bool result_as_bool,
2694                                           Register tmp1, Register tmp2, Register tmp3) {
2695   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2696   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2697   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2698 
2699   Label retry, fail, done;
2700 
2701   bind(retry);
2702   lr_w(old, aligned_addr, acquire);
2703   andr(tmp, old, mask);
2704   bne(tmp, expected, fail);
2705 
2706   andr(tmp, old, not_mask);
2707   orr(tmp, tmp, new_val);
2708   sc_w(tmp, tmp, aligned_addr, release);
2709   bnez(tmp, retry);
2710 
2711   if (result_as_bool) {
2712     mv(result, 1);
2713     j(done);
2714 
2715     bind(fail);
2716     mv(result, zr);
2717 
2718     bind(done);
2719   } else {
2720     andr(tmp, old, mask);
2721 
2722     bind(fail);
2723     srl(result, tmp, shift);
2724 
2725     if (size == int8) {
2726       sign_extend(result, result, 8);
2727     } else {
2728       // size == int16 case
2729       sign_extend(result, result, 16);
2730     }
2731   }
2732 }
2733 
2734 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
2735 // the weak CAS operations. The major difference is that it simply fails when the
2736 // store conditional fails.
2737 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
2738                                                Register new_val,
2739                                                enum operand_size size,
2740                                                Assembler::Aqrl acquire, Assembler::Aqrl release,
2741                                                Register result,
2742                                                Register tmp1, Register tmp2, Register tmp3) {
2743   Register aligned_addr = t1, shift = tmp1, mask = tmp2, not_mask = tmp3, old = result, tmp = t0;
2744   assert_different_registers(addr, old, mask, not_mask, new_val, expected, shift, tmp);
2745   cmpxchg_narrow_value_helper(addr, expected, new_val, size, tmp1, tmp2, tmp3);
2746 
2747   Label fail, done;
2748 
2749   lr_w(old, aligned_addr, acquire);
2750   andr(tmp, old, mask);
2751   bne(tmp, expected, fail);
2752 
2753   andr(tmp, old, not_mask);
2754   orr(tmp, tmp, new_val);
2755   sc_w(tmp, tmp, aligned_addr, release);
2756   bnez(tmp, fail);
2757 
2758   // Success
2759   mv(result, 1);
2760   j(done);
2761 
2762   // Fail
2763   bind(fail);
2764   mv(result, zr);
2765 
2766   bind(done);
2767 }
2768 
2769 void MacroAssembler::cmpxchg(Register addr, Register expected,
2770                              Register new_val,
2771                              enum operand_size size,
2772                              Assembler::Aqrl acquire, Assembler::Aqrl release,
2773                              Register result, bool result_as_bool) {
2774   assert(size != int8 && size != int16, "unsupported operand size");
2775   assert_different_registers(addr, t0);
2776   assert_different_registers(expected, t0);
2777   assert_different_registers(new_val, t0);
2778 
2779   Label retry_load, done, ne_done;
2780   bind(retry_load);
2781   load_reserved(addr, size, acquire);
2782   bne(t0, expected, ne_done);
2783   store_conditional(addr, new_val, size, release);
2784   bnez(t0, retry_load);
2785 
2786   // equal, succeed
2787   if (result_as_bool) {
2788     mv(result, 1);
2789   } else {
2790     mv(result, expected);
2791   }
2792   j(done);
2793 
2794   // not equal, failed
2795   bind(ne_done);
2796   if (result_as_bool) {
2797     mv(result, zr);
2798   } else {
2799     mv(result, t0);
2800   }
2801 
2802   bind(done);
2803 }
2804 
2805 void MacroAssembler::cmpxchg_weak(Register addr, Register expected,
2806                                   Register new_val,
2807                                   enum operand_size size,
2808                                   Assembler::Aqrl acquire, Assembler::Aqrl release,
2809                                   Register result) {
2810   assert_different_registers(addr, t0);
2811   assert_different_registers(expected, t0);
2812   assert_different_registers(new_val, t0);
2813 
2814   Label fail, done;
2815   load_reserved(addr, size, acquire);
2816   bne(t0, expected, fail);
2817   store_conditional(addr, new_val, size, release);
2818   bnez(t0, fail);
2819 
2820   // Success
2821   mv(result, 1);
2822   j(done);
2823 
2824   // Fail
2825   bind(fail);
2826   mv(result, zr);
2827 
2828   bind(done);
2829 }
2830 
2831 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
2832 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2833   prev = prev->is_valid() ? prev : zr;                                                      \
2834   if (incr.is_register()) {                                                                 \
2835     AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
2836   } else {                                                                                  \
2837     mv(t0, incr.as_constant());                                                             \
2838     AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
2839   }                                                                                         \
2840   return;                                                                                   \
2841 }
2842 
2843 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
2844 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
2845 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
2846 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
2847 
2848 #undef ATOMIC_OP
2849 
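     // Atomic exchange helpers built on amoswap; the *al variants add
     // acquire/release ordering.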
2850 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
2851 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
2852   prev = prev->is_valid() ? prev : zr;                                               \
2853   AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
2854   return;                                                                            \
2855 }
2856 
2857 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
2858 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
2859 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
2860 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
2861 
2862 #undef ATOMIC_XCHG
2863 
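     // 32-bit exchange variants that zero-extend the previous value, since
     // amoswap_w sign-extends its result on RV64.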
2864 #define ATOMIC_XCHGU(OP1, OP2)                                                       \
2865 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
2866   atomic_##OP2(prev, newv, addr);                                                    \
2867   zero_extend(prev, prev, 32);                                                       \
2868   return;                                                                            \
2869 }
2870 
2871 ATOMIC_XCHGU(xchgwu, xchgw)
2872 ATOMIC_XCHGU(xchgalwu, xchgalw)
2873 
2874 #undef ATOMIC_XCHGU
2875 
2876 void MacroAssembler::far_jump(Address entry, Register tmp) {
2877   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
2878   assert(CodeCache::find_blob(entry.target()) != nullptr,
2879          "destination of far jump not found in code cache");
2880   assert(entry.rspec().type() == relocInfo::external_word_type
2881         || entry.rspec().type() == relocInfo::runtime_call_type
2882         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
2883   IncompressibleRegion ir(this);  // Fixed length: see MacroAssembler::far_branch_size()
2884   if (far_branches()) {
2885     // We can use auipc + jalr here because we know that the total size of
2886     // the code cache cannot exceed 2Gb.
2887     relocate(entry.rspec(), [&] {
2888       int32_t offset;
2889       la_patchable(tmp, entry, offset);
2890       jalr(x0, tmp, offset);
2891     });
2892   } else {
2893     j(entry);
2894   }
2895 }
2896 
2897 void MacroAssembler::far_call(Address entry, Register tmp) {
2898   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
2899   assert(CodeCache::find_blob(entry.target()) != nullptr,
2900          "destination of far call not found in code cache");
2901   assert(entry.rspec().type() == relocInfo::external_word_type
2902         || entry.rspec().type() == relocInfo::runtime_call_type
2903         || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
2904   IncompressibleRegion ir(this);  // Fixed length: see MacroAssembler::far_branch_size()
2905   if (far_branches()) {
2906     // We can use auipc + jalr here because we know that the total size of
2907     // the code cache cannot exceed 2Gb.
2908     relocate(entry.rspec(), [&] {
2909       int32_t offset;
2910       la_patchable(tmp, entry, offset);
2911       jalr(x1, tmp, offset); // link
2912     });
2913   } else {
2914     jal(entry); // link
2915   }
2916 }
2917 
2918 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
2919                                                    Register super_klass,
2920                                                    Register tmp_reg,
2921                                                    Label* L_success,
2922                                                    Label* L_failure,
2923                                                    Label* L_slow_path,
2924                                                    Register super_check_offset) {
2925   assert_different_registers(sub_klass, super_klass, tmp_reg);
2926   bool must_load_sco = (super_check_offset == noreg);
2927   if (must_load_sco) {
2928     assert(tmp_reg != noreg, "supply either a temp or a register offset");
2929   } else {
2930     assert_different_registers(sub_klass, super_klass, super_check_offset);
2931   }
2932 
2933   Label L_fallthrough;
2934   int label_nulls = 0;
2935   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
2936   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
2937   if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
2938   assert(label_nulls <= 1, "at most one null in batch");
2939 
2940   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
2941   int sco_offset = in_bytes(Klass::super_check_offset_offset());
2942   Address super_check_offset_addr(super_klass, sco_offset);
2943 
2944   // Hacked jmp, which may only be used just before L_fallthrough.
2945 #define final_jmp(label)                                                \
2946   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
2947   else                            j(label)             /*omit semi*/
2948 
2949   // If the pointers are equal, we are done (e.g., String[] elements).
2950   // This self-check enables sharing of secondary supertype arrays among
2951   // non-primary types such as array-of-interface. Otherwise, each such
2952   // type would need its own customized SSA.
2953   // We move this check to the front of the fast path because many
2954   // type checks are in fact trivially successful in this manner,
2955   // so we get a nicely predicted branch right at the start of the check.
2956   beq(sub_klass, super_klass, *L_success);
2957 
2958   // Check the supertype display:
2959   if (must_load_sco) {
2960     lwu(tmp_reg, super_check_offset_addr);
2961     super_check_offset = tmp_reg;
2962   }
2963   add(t0, sub_klass, super_check_offset);
2964   Address super_check_addr(t0);
2965   ld(t0, super_check_addr); // load displayed supertype
2966 
2967   // This check has worked decisively for primary supers.
2968   // Secondary supers are sought in the super_cache ('super_cache_addr').
2969   // (Secondary supers are interfaces and very deeply nested subtypes.)
2970   // This works in the same check above because of a tricky aliasing
2971   // between the super_cache and the primary super display elements.
2972   // (The 'super_check_addr' can address either, as the case requires.)
2973   // Note that the cache is updated below if it does not help us find
2974   // what we need immediately.
2975   // So if it was a primary super, we can just fail immediately.
2976   // Otherwise, it's the slow path for us (no success at this point).
2977 
2978   beq(super_klass, t0, *L_success);
2979   mv(t1, sc_offset);
2980   if (L_failure == &L_fallthrough) {
2981     beq(super_check_offset, t1, *L_slow_path);
2982   } else {
2983     bne(super_check_offset, t1, *L_failure, /* is_far */ true);
2984     final_jmp(*L_slow_path);
2985   }
2986 
2987   bind(L_fallthrough);
2988 
2989 #undef final_jmp
2990 }
2991 
2992 // Scan 'count' pointer-sized words at [addr] for an occurrence of 'value';
2993 // generic.
2994 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
2995                                 Register tmp) {
2996   Label Lloop, Lexit;
2997   beqz(count, Lexit);
2998   bind(Lloop);
2999   ld(tmp, addr);
3000   beq(value, tmp, Lexit);
3001   add(addr, addr, wordSize);
3002   sub(count, count, 1);
3003   bnez(count, Lloop);
3004   bind(Lexit);
3005 }
3006 
3007 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3008                                                    Register super_klass,
3009                                                    Register tmp1_reg,
3010                                                    Register tmp2_reg,
3011                                                    Label* L_success,
3012                                                    Label* L_failure) {
3013   assert_different_registers(sub_klass, super_klass, tmp1_reg);
3014   if (tmp2_reg != noreg) {
3015     assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
3016   }
3017 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
3018 
3019   Label L_fallthrough;
3020   int label_nulls = 0;
3021   if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
3022   if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
3023 
3024   assert(label_nulls <= 1, "at most one null in the batch");
3025 
3026   // A couple of useful fields in sub_klass:
3027   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3028   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3029   Address secondary_supers_addr(sub_klass, ss_offset);
3030   Address super_cache_addr(     sub_klass, sc_offset);
3031 
3032   BLOCK_COMMENT("check_klass_subtype_slow_path");
3033 
3034   // Do a linear scan of the secondary super-klass chain.
3035   // This code is rarely used, so simplicity is a virtue here.
3036   // The repne_scan instruction uses fixed registers, which we must spill.
3037   // Don't worry too much about pre-existing connections with the input regs.
3038 
3039   assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
3040   assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
3041 
3042   RegSet pushed_registers;
3043   if (!IS_A_TEMP(x12)) {
3044     pushed_registers += x12;
3045   }
3046   if (!IS_A_TEMP(x15)) {
3047     pushed_registers += x15;
3048   }
3049 
3050   if (super_klass != x10) {
3051     if (!IS_A_TEMP(x10)) {
3052       pushed_registers += x10;
3053     }
3054   }
3055 
3056   push_reg(pushed_registers, sp);
3057 
3058   // Get super_klass value into x10 (even if it was in x15 or x12)
3059   mv(x10, super_klass);
3060 
3061 #ifndef PRODUCT
3062   mv(t1, (address)&SharedRuntime::_partial_subtype_ctr);
3063   Address pst_counter_addr(t1);
3064   ld(t0, pst_counter_addr);
3065   add(t0, t0, 1);
3066   sd(t0, pst_counter_addr);
3067 #endif // PRODUCT
3068 
3069   // We will consult the secondary-super array.
3070   ld(x15, secondary_supers_addr);
3071   // Load the array length.
3072   lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
3073   // Skip to start of data.
3074   add(x15, x15, Array<Klass*>::base_offset_in_bytes());
3075 
3076   // Set t0 to an obvious invalid value, falling through by default
3077   mv(t0, -1);
3078   // Scan X12 words at [X15] for an occurrence of X10.
3079   repne_scan(x15, x10, x12, t0);
3080 
3081   // pop will restore x10, so we should use a temp register to keep its value
3082   mv(t1, x10);
3083 
3084   // Unspill the temp registers:
3085   pop_reg(pushed_registers, sp);
3086 
3087   bne(t1, t0, *L_failure);
3088 
3089   // Success. Cache the super we found and proceed in triumph.
3090   sd(super_klass, super_cache_addr);
3091 
3092   if (L_success != &L_fallthrough) {
3093     j(*L_success);
3094   }
3095 
3096 #undef IS_A_TEMP
3097 
3098   bind(L_fallthrough);
3099 }
3100 
3101 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
3102 void MacroAssembler::tlab_allocate(Register obj,
3103                                    Register var_size_in_bytes,
3104                                    int con_size_in_bytes,
3105                                    Register tmp1,
3106                                    Register tmp2,
3107                                    Label& slow_case,
3108                                    bool is_far) {
3109   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3110   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
3111 }
3112 
3113 // get_thread() can be called anywhere inside generated code so we
3114 // need to save whatever non-callee save context might get clobbered
3115 // by the call to Thread::current() or, indeed, the call setup code.
3116 void MacroAssembler::get_thread(Register thread) {
3117   // save all call-clobbered regs except thread
3118   RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
3119                       RegSet::range(x28, x31) + ra - thread;
3120   push_reg(saved_regs, sp);
3121 
3122   mv(ra, CAST_FROM_FN_PTR(address, Thread::current));
3123   jalr(ra);
3124   if (thread != c_rarg0) {
3125     mv(thread, c_rarg0);
3126   }
3127 
3128   // restore pushed registers
3129   pop_reg(saved_regs, sp);
3130 }
3131 
3132 void MacroAssembler::load_byte_map_base(Register reg) {
3133   CardTable::CardValue* byte_map_base =
3134     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
3135   mv(reg, (uint64_t)byte_map_base);
3136 }
3137 
3138 void MacroAssembler::la_patchable(Register reg1, const Address &dest, int32_t &offset) {
3139   unsigned long low_address = (uintptr_t)CodeCache::low_bound();
3140   unsigned long high_address = (uintptr_t)CodeCache::high_bound();
3141   unsigned long dest_address = (uintptr_t)dest.target();
3142   long offset_low = dest_address - low_address;
3143   long offset_high = dest_address - high_address;
3144 
3145   assert(dest.getMode() == Address::literal, "la_patchable must be applied to a literal address");
3146   assert((uintptr_t)dest.target() < (1ull << 48), "bad address");
3147 
3148   // RISC-V doesn't compute a page-aligned address, in order to partially
3149   // compensate for the use of *signed* offsets in its base+disp12
3150   // addressing mode (RISC-V's PC-relative reach remains asymmetric:
3151   // [-(2G + 2K), 2G - 2K)).
3152   if (offset_high >= -((1L << 31) + (1L << 11)) && offset_low < (1L << 31) - (1L << 11)) {
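         // auipc materializes pc + (imm20 << 12); adding 0x800 before truncation
         // rounds the upper 20 bits so that auipc plus the sign-extended low 12
         // bits (returned in 'offset') reconstructs the exact target.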
3153     int64_t distance = dest.target() - pc();
3154     auipc(reg1, (int32_t)distance + 0x800);
3155     offset = ((int32_t)distance << 20) >> 20;
3156   } else {
3157     movptr(reg1, dest.target(), offset);
3158   }
3159 }
3160 
3161 void MacroAssembler::build_frame(int framesize) {
3162   assert(framesize >= 2, "framesize must include space for FP/RA");
3163   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3164   sub(sp, sp, framesize);
3165   sd(fp, Address(sp, framesize - 2 * wordSize));
3166   sd(ra, Address(sp, framesize - wordSize));
3167   if (PreserveFramePointer) { add(fp, sp, framesize); }
3168 }
3169 
3170 void MacroAssembler::remove_frame(int framesize) {
3171   assert(framesize >= 2, "framesize must include space for FP/RA");
3172   assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
3173   ld(fp, Address(sp, framesize - 2 * wordSize));
3174   ld(ra, Address(sp, framesize - wordSize));
3175   add(sp, sp, framesize);
3176 }
3177 
3178 void MacroAssembler::reserved_stack_check() {
3179     // testing if reserved zone needs to be enabled
3180     Label no_reserved_zone_enabling;
3181 
3182     ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
3183     bltu(sp, t0, no_reserved_zone_enabling);
3184 
3185     enter();   // RA and FP are live.
3186     mv(c_rarg0, xthread);
3187     RuntimeAddress target(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
3188     relocate(target.rspec(), [&] {
3189       int32_t offset;
3190       la_patchable(t0, target, offset);
3191       jalr(x1, t0, offset);
3192     });
3193     leave();
3194 
3195     // We have already removed our own frame.
3196     // throw_delayed_StackOverflowError will think that it's been
3197     // called by our caller.
3198     target = RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry());
3199     relocate(target.rspec(), [&] {
3200       int32_t offset;
3201       la_patchable(t0, target, offset);
3202       jalr(x0, t0, offset);
3203     });
3204     should_not_reach_here();
3205 
3206     bind(no_reserved_zone_enabling);
3207 }
3208 
3209 // Move the address of the polling page into dest.
3210 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
3211   ld(dest, Address(xthread, JavaThread::polling_page_offset()));
3212 }
3213 
3214 // Read the polling page.  The address of the polling page must
3215 // already be in r.
3216 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
3217   relocate(rtype, [&] {
3218     lwu(zr, Address(r, offset));
3219   });
3220 }
3221 
3222 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3223 #ifdef ASSERT
3224   {
3225     ThreadInVMfromUnknown tiv;
3226     assert (UseCompressedOops, "should only be used for compressed oops");
3227     assert (Universe::heap() != nullptr, "java heap should be initialized");
3228     assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
3229     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3230   }
3231 #endif
3232   int oop_index = oop_recorder()->find_index(obj);
3233   relocate(oop_Relocation::spec(oop_index), [&] {
3234     li32(dst, 0xDEADBEEF);
3235   });
3236   zero_extend(dst, dst, 32);
3237 }
3238 
3239 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3240   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3241   assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
3242   int index = oop_recorder()->find_index(k);
3243   assert(!Universe::heap()->is_in(k), "should not be an oop");
3244 
3245   narrowKlass nk = CompressedKlassPointers::encode(k);
3246   relocate(metadata_Relocation::spec(index), [&] {
3247     li32(dst, nk);
3248   });
3249   zero_extend(dst, dst, 32);
3250 }
3251 
3252 // Maybe emit a call via a trampoline. If the code cache is small
3253 // trampolines won't be emitted.
3254 address MacroAssembler::trampoline_call(Address entry) {
3255   assert(entry.rspec().type() == relocInfo::runtime_call_type ||
3256          entry.rspec().type() == relocInfo::opt_virtual_call_type ||
3257          entry.rspec().type() == relocInfo::static_call_type ||
3258          entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
3259 
3260   address target = entry.target();
3261 
3262   // We need a trampoline if branches are far.
3263   if (far_branches()) {
3264     if (!in_scratch_emit_size()) {
3265       if (entry.rspec().type() == relocInfo::runtime_call_type) {
3266         assert(CodeBuffer::supports_shared_stubs(), "must support shared stubs");
3267         code()->share_trampoline_for(entry.target(), offset());
3268       } else {
3269         address stub = emit_trampoline_stub(offset(), target);
3270         if (stub == nullptr) {
3271           postcond(pc() == badAddress);
3272           return nullptr; // CodeCache is full
3273         }
3274       }
3275     }
3276     target = pc();
3277   }
3278 
3279   address call_pc = pc();
3280 #ifdef ASSERT
3281   if (entry.rspec().type() != relocInfo::runtime_call_type) {
3282     assert_alignment(call_pc);
3283   }
3284 #endif
3285   relocate(entry.rspec(), [&] {
3286     jal(target);
3287   });
3288 
3289   postcond(pc() != badAddress);
3290   return call_pc;
3291 }
3292 
3293 address MacroAssembler::ic_call(address entry, jint method_index) {
3294   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
3295   IncompressibleRegion ir(this);  // relocations
3296   movptr(t1, (address)Universe::non_oop_word());
3297   assert_cond(entry != nullptr);
3298   return trampoline_call(Address(entry, rh));
3299 }
3300 
3301 // Emit a trampoline stub for a call to a target which is too far away.
3302 //
3303 // code sequences:
3304 //
3305 // call-site:
3306 //   branch-and-link to <destination> or <trampoline stub>
3307 //
3308 // Related trampoline stub for this call site in the stub section:
3309 //   load the call target from the constant pool
3310 //   branch (RA still points to the call site above)
3311 
3312 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
3313                                              address dest) {
3314   // Max stub size: alignment nop, TrampolineStub.
3315   address stub = start_a_stub(max_trampoline_stub_size());
3316   if (stub == nullptr) {
3317     return nullptr;  // CodeBuffer::expand failed
3318   }
3319 
3320   // We are always 4-byte aligned here.
3321   assert_alignment(pc());
3322 
3323   // Create a trampoline stub relocation which relates this trampoline stub
3324   // with the call instruction at insts_call_instruction_offset in the
3325   // instructions code-section.
3326 
3327   // Make sure the address of the destination is 8-byte aligned after 3 instructions.
3328   align(wordSize, NativeCallTrampolineStub::data_offset);
3329 
3330   RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
3331                                                          insts_call_instruction_offset);
3332   const int stub_start_offset = offset();
3333   relocate(rh, [&] {
3334     // Now, create the trampoline stub's code:
3335     // - load the call
3336     // - call
3337     Label target;
3338     ld(t0, target);  // auipc + ld
3339     jr(t0);          // jalr
3340     bind(target);
3341     assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
3342            "should be");
3343     assert(offset() % wordSize == 0, "bad alignment");
3344     emit_int64((int64_t)dest);
3345   });
3346 
3347   const address stub_start_addr = addr_at(stub_start_offset);
3348 
3349   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
3350 
3351   end_a_stub();
3352   return stub_start_addr;
3353 }
3354 
3355 int MacroAssembler::max_trampoline_stub_size() {
3356   // Max stub size: alignment nop, TrampolineStub.
3357   return NativeInstruction::instruction_size + NativeCallTrampolineStub::instruction_size;
3358 }
3359 
3360 int MacroAssembler::static_call_stub_size() {
3361   // (lui, addi, slli, addi, slli, addi) + (lui, addi, slli, addi, slli) + jalr
3362   return 12 * NativeInstruction::instruction_size;
3363 }
3364 
3365 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
3366   switch (dst.getMode()) {
3367     case Address::base_plus_offset:
3368       // This is the expected mode, although we allow all the other
3369       // forms below.
3370       return form_address(tmp, dst.base(), dst.offset());
3371     default:
3372       la(tmp, dst);
3373       return Address(tmp);
3374   }
3375 }
3376 
3377 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3378   assert(((dst.getMode() == Address::base_plus_offset &&
3379            is_simm12(dst.offset())) || is_simm12(value)),
3380           "invalid value and address mode combination");
3381   Address adr = add_memory_helper(dst, tmp2);
3382   assert(!adr.uses(tmp1), "invalid dst for address increment");
3383   ld(tmp1, adr);
3384   add(tmp1, tmp1, value, tmp2);
3385   sd(tmp1, adr);
3386 }
3387 
3388 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3389   assert(((dst.getMode() == Address::base_plus_offset &&
3390            is_simm12(dst.offset())) || is_simm12(value)),
3391           "invalid value and address mode combination");
3392   Address adr = add_memory_helper(dst, tmp2);
3393   assert(!adr.uses(tmp1), "invalid dst for address increment");
3394   lwu(tmp1, adr);
3395   addw(tmp1, tmp1, value, tmp2);
3396   sw(tmp1, adr);
3397 }
3398 
3399 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
3400   assert(((dst.getMode() == Address::base_plus_offset &&
3401            is_simm12(dst.offset())) || is_simm12(value)),
3402           "invalid value and address mode combination");
3403   Address adr = add_memory_helper(dst, tmp2);
3404   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3405   ld(tmp1, adr);
3406   sub(tmp1, tmp1, value, tmp2);
3407   sd(tmp1, adr);
3408 }
3409 
3410 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
3411   assert(((dst.getMode() == Address::base_plus_offset &&
3412            is_simm12(dst.offset())) || is_simm12(value)),
3413           "invalid value and address mode combination");
3414   Address adr = add_memory_helper(dst, tmp2);
3415   assert(!adr.uses(tmp1), "invalid dst for address decrement");
3416   lwu(tmp1, adr);
3417   subw(tmp1, tmp1, value, tmp2);
3418   sw(tmp1, adr);
3419 }
3420 
3421 void MacroAssembler::cmpptr(Register src1, Address src2, Label& equal) {
3422   assert_different_registers(src1, t0);
3423   relocate(src2.rspec(), [&] {
3424     int32_t offset;
3425     la_patchable(t0, src2, offset);
3426     ld(t0, Address(t0, offset));
3427   });
3428   beq(src1, t0, equal);
3429 }
3430 
3431 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
3432   load_method_holder(result, method);
3433   ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
3434 }
3435 
3436 void MacroAssembler::load_method_holder(Register holder, Register method) {
3437   ld(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3438   ld(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3439   ld(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
3440 }
3441 
3442 // string indexof
3443 // compute index by trailing zeros
3444 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
3445                                    Register match_mask, Register result,
3446                                    Register ch2, Register tmp,
3447                                    bool haystack_isL) {
3448   int haystack_chr_shift = haystack_isL ? 0 : 1;
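       // trailing_zeros is the bit index of the first set bit in match_mask: convert
       // it to a byte offset, advance the haystack pointer, reload the candidate
       // characters, and bump the running result index by the element offset.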
3449   srl(match_mask, match_mask, trailing_zeros);
3450   srli(match_mask, match_mask, 1);
3451   srli(tmp, trailing_zeros, LogBitsPerByte);
3452   if (!haystack_isL) andi(tmp, tmp, 0xE);
3453   add(haystack, haystack, tmp);
3454   ld(ch2, Address(haystack));
3455   if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
3456   add(result, result, tmp);
3457 }
3458 
3459 // string indexof
3460 // Find the pattern element in src and compute the match mask;
3461 // only the first (lowest) occurrence of 0x80/0x8000 is the valid match index.
3462 // Match mask patterns and the corresponding indices look like:
3463 // - 0x8080808080808080 (Latin1)
3464 // -   7 6 5 4 3 2 1 0  (match index)
3465 // - 0x8000800080008000 (UTF16)
3466 // -   3   2   1   0    (match index)
3467 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
3468                                         Register mask1, Register mask2) {
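       // Classic SWAR zero-byte (or zero-halfword) detection: after the xor, lanes
       // equal to the pattern become zero; (v - mask1) & ~(v | mask2) then leaves
       // the high bit set only in those lanes (mask1/mask2 are the usual
       // 0x01...01 / 0x7f...7f style constants supplied by the caller).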
3469   xorr(src, pattern, src);
3470   sub(match_mask, src, mask1);
3471   orr(src, src, mask2);
3472   notr(src, src);
3473   andr(match_mask, match_mask, src);
3474 }
3475 
3476 #ifdef COMPILER2
3477 // Code for BigInteger::mulAdd intrinsic
3478 // out     = x10
3479 // in      = x11
3480 // offset  = x12  (already out.length-offset)
3481 // len     = x13
3482 // k       = x14
3483 // tmp     = x28
3484 //
3485 // pseudo code from java implementation:
3486 // long kLong = k & LONG_MASK;
3487 // carry = 0;
3488 // offset = out.length-offset - 1;
3489 // for (int j = len - 1; j >= 0; j--) {
3490 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3491 //     out[offset--] = (int)product;
3492 //     carry = product >>> 32;
3493 // }
3494 // return (int)carry;
3495 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3496                              Register len, Register k, Register tmp) {
3497   Label L_tail_loop, L_unroll, L_end;
3498   mv(tmp, out);
3499   mv(out, zr);
3500   blez(len, L_end);
3501   zero_extend(k, k, 32);
3502   slliw(t0, offset, LogBytesPerInt);
3503   add(offset, tmp, t0);
3504   slliw(t0, len, LogBytesPerInt);
3505   add(in, in, t0);
3506 
3507   const int unroll = 8;
3508   mv(tmp, unroll);
3509   blt(len, tmp, L_tail_loop);
3510   bind(L_unroll);
3511   for (int i = 0; i < unroll; i++) {
3512     sub(in, in, BytesPerInt);
3513     lwu(t0, Address(in, 0));
3514     mul(t1, t0, k);
3515     add(t0, t1, out);
3516     sub(offset, offset, BytesPerInt);
3517     lwu(t1, Address(offset, 0));
3518     add(t0, t0, t1);
3519     sw(t0, Address(offset, 0));
3520     srli(out, t0, 32);
3521   }
3522   subw(len, len, tmp);
3523   bge(len, tmp, L_unroll);
3524 
3525   bind(L_tail_loop);
3526   blez(len, L_end);
3527   sub(in, in, BytesPerInt);
3528   lwu(t0, Address(in, 0));
3529   mul(t1, t0, k);
3530   add(t0, t1, out);
3531   sub(offset, offset, BytesPerInt);
3532   lwu(t1, Address(offset, 0));
3533   add(t0, t0, t1);
3534   sw(t0, Address(offset, 0));
3535   srli(out, t0, 32);
3536   subw(len, len, 1);
3537   j(L_tail_loop);
3538 
3539   bind(L_end);
3540 }
3541 
3542 // add two unsigned inputs and output the carry
3543 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
3544 {
3545   assert_different_registers(dst, carry);
3546   assert_different_registers(dst, src2);
3547   add(dst, src1, src2);
3548   sltu(carry, dst, src2);
3549 }
3550 
3551 // add two inputs with carry
3552 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
3553   assert_different_registers(dst, carry);
3554   add(dst, src1, src2);
3555   add(dst, dst, carry);
3556 }
3557 
3558 // add two unsigned inputs with carry and output the carry
3559 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
3560   assert_different_registers(dst, src2);
3561   adc(dst, src1, src2, carry);
3562   sltu(carry, dst, src2);
3563 }
3564 
3565 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
3566                                      Register src1, Register src2, Register carry) {
3567   cad(dest_lo, dest_lo, src1, carry);
3568   add(dest_hi, dest_hi, carry);
3569   cad(dest_lo, dest_lo, src2, carry);
3570   add(final_dest_hi, dest_hi, carry);
3571 }
3572 
3573 /**
3574  * Multiply 32 bit by 32 bit first loop.
3575  */
3576 void MacroAssembler::multiply_32_x_32_loop(Register x, Register xstart, Register x_xstart,
3577                                            Register y, Register y_idx, Register z,
3578                                            Register carry, Register product,
3579                                            Register idx, Register kdx) {
3580   // jlong carry, x[], y[], z[];
3581   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3582   //     long product = y[idx] * x[xstart] + carry;
3583   //     z[kdx] = (int)product;
3584   //     carry = product >>> 32;
3585   // }
3586   // z[xstart] = (int)carry;
3587 
3588   Label L_first_loop, L_first_loop_exit;
3589   blez(idx, L_first_loop_exit);
3590 
3591   shadd(t0, xstart, x, t0, LogBytesPerInt);
3592   lwu(x_xstart, Address(t0, 0));
3593 
3594   bind(L_first_loop);
3595   subw(idx, idx, 1);
3596   shadd(t0, idx, y, t0, LogBytesPerInt);
3597   lwu(y_idx, Address(t0, 0));
3598   mul(product, x_xstart, y_idx);
3599   add(product, product, carry);
3600   srli(carry, product, 32);
3601   subw(kdx, kdx, 1);
3602   shadd(t0, kdx, z, t0, LogBytesPerInt);
3603   sw(product, Address(t0, 0));
3604   bgtz(idx, L_first_loop);
3605 
3606   bind(L_first_loop_exit);
3607 }
3608 
3609 /**
3610  * Multiply 64 bit by 64 bit first loop.
3611  */
3612 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
3613                                            Register y, Register y_idx, Register z,
3614                                            Register carry, Register product,
3615                                            Register idx, Register kdx) {
3616   //
3617   //  jlong carry, x[], y[], z[];
3618   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3619   //    huge_128 product = y[idx] * x[xstart] + carry;
3620   //    z[kdx] = (jlong)product;
3621   //    carry  = (jlong)(product >>> 64);
3622   //  }
3623   //  z[xstart] = carry;
3624   //
3625 
3626   Label L_first_loop, L_first_loop_exit;
3627   Label L_one_x, L_one_y, L_multiply;
3628 
3629   subw(xstart, xstart, 1);
3630   bltz(xstart, L_one_x);
3631 
3632   shadd(t0, xstart, x, t0, LogBytesPerInt);
3633   ld(x_xstart, Address(t0, 0));
3634   ror_imm(x_xstart, x_xstart, 32); // convert big-endian to little-endian
3635 
3636   bind(L_first_loop);
3637   subw(idx, idx, 1);
3638   bltz(idx, L_first_loop_exit);
3639   subw(idx, idx, 1);
3640   bltz(idx, L_one_y);
3641 
3642   shadd(t0, idx, y, t0, LogBytesPerInt);
3643   ld(y_idx, Address(t0, 0));
3644   ror_imm(y_idx, y_idx, 32); // convert big-endian to little-endian
3645   bind(L_multiply);
3646 
3647   mulhu(t0, x_xstart, y_idx);
3648   mul(product, x_xstart, y_idx);
3649   cad(product, product, carry, t1);
3650   adc(carry, t0, zr, t1);
3651 
3652   subw(kdx, kdx, 2);
3653   ror_imm(product, product, 32); // back to big-endian
3654   shadd(t0, kdx, z, t0, LogBytesPerInt);
3655   sd(product, Address(t0, 0));
3656 
3657   j(L_first_loop);
3658 
3659   bind(L_one_y);
3660   lwu(y_idx, Address(y, 0));
3661   j(L_multiply);
3662 
3663   bind(L_one_x);
3664   lwu(x_xstart, Address(x, 0));
3665   j(L_first_loop);
3666 
3667   bind(L_first_loop_exit);
3668 }
3669 
3670 /**
3671  * Multiply 128 bit by 128 bit. Unrolled inner loop.
3672  *
3673  */
3674 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3675                                              Register carry, Register carry2,
3676                                              Register idx, Register jdx,
3677                                              Register yz_idx1, Register yz_idx2,
3678                                              Register tmp, Register tmp3, Register tmp4,
3679                                              Register tmp6, Register product_hi) {
3680   //   jlong carry, x[], y[], z[];
3681   //   int kdx = xstart+1;
3682   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3683   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3684   //     jlong carry2  = (jlong)(tmp3 >>> 64);
3685   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
3686   //     carry  = (jlong)(tmp4 >>> 64);
3687   //     z[kdx+idx+1] = (jlong)tmp3;
3688   //     z[kdx+idx] = (jlong)tmp4;
3689   //   }
3690   //   idx += 2;
3691   //   if (idx > 0) {
3692   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3693   //     z[kdx+idx] = (jlong)yz_idx1;
3694   //     carry  = (jlong)(yz_idx1 >>> 64);
3695   //   }
3696   //
3697 
3698   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3699 
3700   srliw(jdx, idx, 2);
3701 
3702   bind(L_third_loop);
3703 
3704   subw(jdx, jdx, 1);
3705   bltz(jdx, L_third_loop_exit);
3706   subw(idx, idx, 4);
3707 
3708   shadd(t0, idx, y, t0, LogBytesPerInt);
3709   ld(yz_idx2, Address(t0, 0));
3710   ld(yz_idx1, Address(t0, wordSize));
3711 
3712   shadd(tmp6, idx, z, t0, LogBytesPerInt);
3713 
3714   ror_imm(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
3715   ror_imm(yz_idx2, yz_idx2, 32);
3716 
3717   ld(t1, Address(tmp6, 0));
3718   ld(t0, Address(tmp6, wordSize));
3719 
3720   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
3721   mulhu(tmp4, product_hi, yz_idx1);
3722 
3723   ror_imm(t0, t0, 32, tmp); // convert big-endian to little-endian
3724   ror_imm(t1, t1, 32, tmp);
3725 
3726   mul(tmp, product_hi, yz_idx2); //  yz_idx2 * product_hi -> carry2:tmp
3727   mulhu(carry2, product_hi, yz_idx2);
3728 
3729   cad(tmp3, tmp3, carry, carry);
3730   adc(tmp4, tmp4, zr, carry);
3731   cad(tmp3, tmp3, t0, t0);
3732   cadc(tmp4, tmp4, tmp, t0);
3733   adc(carry, carry2, zr, t0);
3734   cad(tmp4, tmp4, t1, carry2);
3735   adc(carry, carry, zr, carry2);
3736 
3737   ror_imm(tmp3, tmp3, 32); // convert little-endian to big-endian
3738   ror_imm(tmp4, tmp4, 32);
3739   sd(tmp4, Address(tmp6, 0));
3740   sd(tmp3, Address(tmp6, wordSize));
3741 
3742   j(L_third_loop);
3743 
3744   bind(L_third_loop_exit);
3745 
3746   andi(idx, idx, 0x3);
3747   beqz(idx, L_post_third_loop_done);
3748 
3749   Label L_check_1;
3750   subw(idx, idx, 2);
3751   bltz(idx, L_check_1);
3752 
3753   shadd(t0, idx, y, t0, LogBytesPerInt);
3754   ld(yz_idx1, Address(t0, 0));
3755   ror_imm(yz_idx1, yz_idx1, 32);
3756 
3757   mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
3758   mulhu(tmp4, product_hi, yz_idx1);
3759 
3760   shadd(t0, idx, z, t0, LogBytesPerInt);
3761   ld(yz_idx2, Address(t0, 0));
3762   ror_imm(yz_idx2, yz_idx2, 32, tmp);
3763 
3764   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
3765 
3766   ror_imm(tmp3, tmp3, 32, tmp);
3767   sd(tmp3, Address(t0, 0));
3768 
3769   bind(L_check_1);
3770 
3771   andi(idx, idx, 0x1);
3772   subw(idx, idx, 1);
3773   bltz(idx, L_post_third_loop_done);
3774   shadd(t0, idx, y, t0, LogBytesPerInt);
3775   lwu(tmp4, Address(t0, 0));
3776   mul(tmp3, tmp4, product_hi); //  tmp4 * product_hi -> carry2:tmp3
3777   mulhu(carry2, tmp4, product_hi);
3778 
3779   shadd(t0, idx, z, t0, LogBytesPerInt);
3780   lwu(tmp4, Address(t0, 0));
3781 
3782   add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
3783 
3784   shadd(t0, idx, z, t0, LogBytesPerInt);
3785   sw(tmp3, Address(t0, 0));
3786 
3787   slli(t0, carry2, 32);
3788   srli(carry, tmp3, 32);
3789   orr(carry, carry, t0);
3790 
3791   bind(L_post_third_loop_done);
3792 }
3793 
3794 /**
3795  * Code for BigInteger::multiplyToLen() intrinsic.
3796  *
3797  * x10: x
3798  * x11: xlen
3799  * x12: y
3800  * x13: ylen
3801  * x14: z
3802  * x15: zlen
3803  * x16: tmp1
3804  * x17: tmp2
3805  * x7:  tmp3
3806  * x28: tmp4
3807  * x29: tmp5
3808  * x30: tmp6
3809  * x31: tmp7
3810  */
3811 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3812                                      Register z, Register zlen,
3813                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3814                                      Register tmp5, Register tmp6, Register product_hi) {
3815   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3816 
3817   const Register idx = tmp1;
3818   const Register kdx = tmp2;
3819   const Register xstart = tmp3;
3820 
3821   const Register y_idx = tmp4;
3822   const Register carry = tmp5;
3823   const Register product = xlen;
3824   const Register x_xstart = zlen; // reuse register
3825 
3826   mv(idx, ylen); // idx = ylen;
3827   mv(kdx, zlen); // kdx = xlen+ylen;
3828   mv(carry, zr); // carry = 0;
3829 
3830   Label L_multiply_64_x_64_loop, L_done;
3831 
3832   subw(xstart, xlen, 1);
3833   bltz(xstart, L_done);
3834 
3835   const Register jdx = tmp1;
3836 
3837   if (AvoidUnalignedAccesses) {
3838     // Check if x and y are both 8-byte aligned.
3839     orr(t0, xlen, ylen);
3840     test_bit(t0, t0, 0);
3841     beqz(t0, L_multiply_64_x_64_loop);
3842 
3843     multiply_32_x_32_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3844     shadd(t0, xstart, z, t0, LogBytesPerInt);
3845     sw(carry, Address(t0, 0));
3846 
3847     Label L_second_loop_unaligned;
3848     bind(L_second_loop_unaligned);
3849     mv(carry, zr);
3850     mv(jdx, ylen);
3851     subw(xstart, xstart, 1);
3852     bltz(xstart, L_done);
3853     sub(sp, sp, 2 * wordSize);
3854     sd(z, Address(sp, 0));
3855     sd(zr, Address(sp, wordSize));
3856     shadd(t0, xstart, z, t0, LogBytesPerInt);
3857     addi(z, t0, 4);
3858     shadd(t0, xstart, x, t0, LogBytesPerInt);
3859     lwu(product, Address(t0, 0));
3860     Label L_third_loop, L_third_loop_exit;
3861 
3862     blez(jdx, L_third_loop_exit);
3863 
3864     bind(L_third_loop);
3865     subw(jdx, jdx, 1);
3866     shadd(t0, jdx, y, t0, LogBytesPerInt);
3867     lwu(t0, Address(t0, 0));
3868     mul(t1, t0, product);
3869     add(t0, t1, carry);
3870     shadd(tmp6, jdx, z, t1, LogBytesPerInt);
3871     lwu(t1, Address(tmp6, 0));
3872     add(t0, t0, t1);
3873     sw(t0, Address(tmp6, 0));
3874     srli(carry, t0, 32);
3875     bgtz(jdx, L_third_loop);
3876 
3877     bind(L_third_loop_exit);
3878     ld(z, Address(sp, 0));
3879     addi(sp, sp, 2 * wordSize);
3880     shadd(t0, xstart, z, t0, LogBytesPerInt);
3881     sw(carry, Address(t0, 0));
3882 
3883     j(L_second_loop_unaligned);
3884   }
3885 
3886   bind(L_multiply_64_x_64_loop);
3887   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3888 
3889   Label L_second_loop_aligned;
3890   beqz(kdx, L_second_loop_aligned);
3891 
3892   Label L_carry;
3893   subw(kdx, kdx, 1);
3894   beqz(kdx, L_carry);
3895 
3896   shadd(t0, kdx, z, t0, LogBytesPerInt);
3897   sw(carry, Address(t0, 0));
3898   srli(carry, carry, 32);
3899   subw(kdx, kdx, 1);
3900 
3901   bind(L_carry);
3902   shadd(t0, kdx, z, t0, LogBytesPerInt);
3903   sw(carry, Address(t0, 0));
3904 
3905   // Second and third (nested) loops.
3906   //
3907   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3908   //   carry = 0;
3909   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3910   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3911   //                    (z[k] & LONG_MASK) + carry;
3912   //     z[k] = (int)product;
3913   //     carry = product >>> 32;
3914   //   }
3915   //   z[i] = (int)carry;
3916   // }
3917   //
3918   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3919 
3920   bind(L_second_loop_aligned);
3921   mv(carry, zr); // carry = 0;
3922   mv(jdx, ylen); // j = ystart+1
3923 
3924   subw(xstart, xstart, 1); // i = xstart-1;
3925   bltz(xstart, L_done);
3926 
3927   sub(sp, sp, 4 * wordSize);
3928   sd(z, Address(sp, 0));
3929 
3930   Label L_last_x;
3931   shadd(t0, xstart, z, t0, LogBytesPerInt);
3932   addi(z, t0, 4);
3933   subw(xstart, xstart, 1); // i = xstart-1;
3934   bltz(xstart, L_last_x);
3935 
3936   shadd(t0, xstart, x, t0, LogBytesPerInt);
3937   ld(product_hi, Address(t0, 0));
3938   ror_imm(product_hi, product_hi, 32); // convert big-endian to little-endian
3939 
3940   Label L_third_loop_prologue;
3941   bind(L_third_loop_prologue);
3942 
3943   sd(ylen, Address(sp, wordSize));
3944   sd(x, Address(sp, 2 * wordSize));
3945   sd(xstart, Address(sp, 3 * wordSize));
3946   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3947                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3948   ld(z, Address(sp, 0));
3949   ld(ylen, Address(sp, wordSize));
3950   ld(x, Address(sp, 2 * wordSize));
3951   ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
3952   addi(sp, sp, 4 * wordSize);
3953 
3954   addiw(tmp3, xlen, 1);
3955   shadd(t0, tmp3, z, t0, LogBytesPerInt);
3956   sw(carry, Address(t0, 0));
3957 
3958   subw(tmp3, tmp3, 1);
3959   bltz(tmp3, L_done);
3960 
3961   srli(carry, carry, 32);
3962   shadd(t0, tmp3, z, t0, LogBytesPerInt);
3963   sw(carry, Address(t0, 0));
3964   j(L_second_loop_aligned);
3965 
3966   // Next infrequent code is moved outside loops.
3967   bind(L_last_x);
3968   lwu(product_hi, Address(x, 0));
3969   j(L_third_loop_prologue);
3970 
3971   bind(L_done);
3972 }
3973 #endif
3974 
3975 // Count the bits of trailing zero chars, from lsb to msb, until the first non-zero element.
3976 // In the LL case each element is one byte, so we step 8 bits at a time; otherwise each
3977 // element is two bytes and we step 16 bits at a time.
3978 void MacroAssembler::ctzc_bit(Register Rd, Register Rs, bool isLL, Register tmp1, Register tmp2) {
3979   if (UseZbb) {
3980     assert_different_registers(Rd, Rs, tmp1);
3981     int step = isLL ? 8 : 16;
3982     ctz(Rd, Rs);
3983     andi(tmp1, Rd, step - 1);
3984     sub(Rd, Rd, tmp1);
3985     return;
3986   }
3987 
3988   assert_different_registers(Rd, Rs, tmp1, tmp2);
3989   Label Loop;
3990   int step = isLL ? 8 : 16;
3991   mv(Rd, -step);
3992   mv(tmp2, Rs);
3993 
3994   bind(Loop);
3995   addi(Rd, Rd, step);
3996   andi(tmp1, tmp2, ((1 << step) - 1));
3997   srli(tmp2, tmp2, step);
3998   beqz(tmp1, Loop);
3999 }
4000 
4001 // Read 4 adjacent bytes from the lower half of the source register and inflate
4002 // them into the destination register, for example:
4003 // Rs: A7A6A5A4A3A2A1A0
4004 // Rd: 00A300A200A100A0
4005 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
4006   assert_different_registers(Rd, Rs, tmp1, tmp2);
4007 
4008   mv(tmp1, 0xFF000000); // first byte mask at lower word
4009   andr(Rd, Rs, tmp1);
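       // Note: wordSize (8) is used below purely as a bit count, shifting one byte
       // lane (8 bits) per step.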
4010   for (int i = 0; i < 2; i++) {
4011     slli(Rd, Rd, wordSize);
4012     srli(tmp1, tmp1, wordSize);
4013     andr(tmp2, Rs, tmp1);
4014     orr(Rd, Rd, tmp2);
4015   }
4016   slli(Rd, Rd, wordSize);
4017   andi(tmp2, Rs, 0xFF); // last byte mask at lower word
4018   orr(Rd, Rd, tmp2);
4019 }
4020 
4021 // Read 4 adjacent bytes from the upper half of the source register and inflate
4022 // them into the destination register, for example:
4023 // Rs: A7A6A5A4A3A2A1A0
4024 // Rd: 00A700A600A500A4
4025 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
4026   assert_different_registers(Rd, Rs, tmp1, tmp2);
4027   srli(Rs, Rs, 32);   // only upper 32 bits are needed
4028   inflate_lo32(Rd, Rs, tmp1, tmp2);
4029 }
4030 
4031 // The size of the blocks erased by the zero_blocks stub.  We must
4032 // handle anything smaller than this ourselves in zero_words().
4033 const int MacroAssembler::zero_words_block_size = 8;
4034 
4035 // zero_words() is used by C2 ClearArray patterns.  It is as small as
4036 // possible, handling small word counts locally and delegating
4037 // anything larger to the zero_blocks stub.  It is expanded many times
4038 // in compiled code, so it is important to keep it short.
4039 
4040 // ptr:   Address of a buffer to be zeroed.
4041 // cnt:   Count in HeapWords.
4042 //
4043 // ptr, cnt, and t0 are clobbered.
4044 address MacroAssembler::zero_words(Register ptr, Register cnt) {
4045   assert(is_power_of_2(zero_words_block_size), "adjust this");
4046   assert(ptr == x28 && cnt == x29, "mismatch in register usage");
4047   assert_different_registers(cnt, t0);
4048 
4049   BLOCK_COMMENT("zero_words {");
4050 
4051   mv(t0, zero_words_block_size);
4052   Label around, done, done16;
4053   bltu(cnt, t0, around);
4054   {
4055     RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::riscv::zero_blocks());
4056     assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
4057     if (StubRoutines::riscv::complete()) {
4058       address tpc = trampoline_call(zero_blocks);
4059       if (tpc == nullptr) {
4060         DEBUG_ONLY(reset_labels(around));
4061         postcond(pc() == badAddress);
4062         return nullptr;
4063       }
4064     } else {
4065       jal(zero_blocks);
4066     }
4067   }
4068   bind(around);
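       // Tail: zero the remaining (< zero_words_block_size) words by testing the
       // low bits of cnt, largest power of two first.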
4069   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
4070     Label l;
4071     test_bit(t0, cnt, exact_log2(i));
4072     beqz(t0, l);
4073     for (int j = 0; j < i; j++) {
4074       sd(zr, Address(ptr, j * wordSize));
4075     }
4076     addi(ptr, ptr, i * wordSize);
4077     bind(l);
4078   }
4079   {
4080     Label l;
4081     test_bit(t0, cnt, 0);
4082     beqz(t0, l);
4083     sd(zr, Address(ptr, 0));
4084     bind(l);
4085   }
4086 
4087   BLOCK_COMMENT("} zero_words");
4088   postcond(pc() != badAddress);
4089   return pc();
4090 }
4091 
4092 #define SmallArraySize (18 * BytesPerLong)
4093 
4094 // base:  Address of a buffer to be zeroed, 8-byte aligned.
4095 // cnt:   Immediate count in HeapWords.
4096 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
4097   assert_different_registers(base, t0, t1);
4098 
4099   BLOCK_COMMENT("zero_words {");
4100 
4101   if (cnt <= SmallArraySize / BytesPerLong) {
4102     for (int i = 0; i < (int)cnt; i++) {
4103       sd(zr, Address(base, i * wordSize));
4104     }
4105   } else {
4106     const int unroll = 8; // Number of sd(zr, adr) instructions we'll unroll
4107     int remainder = cnt % unroll;
4108     for (int i = 0; i < remainder; i++) {
4109       sd(zr, Address(base, i * wordSize));
4110     }
4111 
4112     Label loop;
4113     Register cnt_reg = t0;
4114     Register loop_base = t1;
4115     cnt = cnt - remainder;
4116     mv(cnt_reg, cnt);
4117     add(loop_base, base, remainder * wordSize);
4118     bind(loop);
4119     sub(cnt_reg, cnt_reg, unroll);
4120     for (int i = 0; i < unroll; i++) {
4121       sd(zr, Address(loop_base, i * wordSize));
4122     }
4123     add(loop_base, loop_base, unroll * wordSize);
4124     bnez(cnt_reg, loop);
4125   }
4126 
4127   BLOCK_COMMENT("} zero_words");
4128 }
4129 
4130 // base:   Address of a buffer to be filled, 8-byte aligned.
4131 // cnt:    Count in 8-byte units.
4132 // value:  Value to be filled with.
4133 // base will point to the end of the buffer after filling.
4134 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
4135 //  Algorithm:
4136 //
4137 //    t0 = cnt & 7
4138 //    cnt -= t0
4139 //    p += t0
4140 //    switch (t0):
4141 //      switch start:
4142 //      do while cnt
4143 //        cnt -= 8
4144 //          p[-8] = value
4145 //        case 7:
4146 //          p[-7] = value
4147 //        case 6:
4148 //          p[-6] = value
4149 //          // ...
4150 //        case 1:
4151 //          p[-1] = value
4152 //        case 0:
4153 //          p += 8
4154 //      do-while end
4155 //    switch end
4156 
4157   assert_different_registers(base, cnt, value, t0, t1);
4158 
4159   Label fini, skip, entry, loop;
4160   const int unroll = 8; // Number of sd instructions we'll unroll
4161 
4162   beqz(cnt, fini);
4163 
4164   andi(t0, cnt, unroll - 1);
4165   sub(cnt, cnt, t0);
4166   // Advance base past the first (cnt % 8) words; the computed jump below runs
4166   // just enough trailing sd's of the unrolled block to fill them.
4167   shadd(base, t0, base, t1, 3);
4168   la(t1, entry);
4169   slli(t0, t0, 2); // each sd is a 4-byte instruction, so jump back (cnt % 8) * 4 bytes from 'entry'
4170   sub(t1, t1, t0);
4171   jr(t1);
4172 
4173   bind(loop);
4174   add(base, base, unroll * 8);
4175   for (int i = -unroll; i < 0; i++) {
4176     sd(value, Address(base, i * 8));
4177   }
4178   bind(entry);
4179   sub(cnt, cnt, unroll);
4180   bgez(cnt, loop);
4181 
4182   bind(fini);
4183 }
4184 
4185 // Zero blocks of memory by using CBO.ZERO.
4186 //
4187 // Aligns the base address first sufficiently for CBO.ZERO, then uses
4188 // CBO.ZERO repeatedly for every full block.  cnt is the size to be
4189 // zeroed in HeapWords.  Returns the count of words left to be zeroed
4190 // in cnt.
4191 //
4192 // NOTE: This is intended to be used in the zero_blocks() stub.  If
4193 // you want to use it elsewhere, note that cnt must be >= CacheLineSize.
4194 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
4195   Label initial_table_end, loop;
4196 
4197   // Align base with cache line size.
4198   neg(tmp1, base);
4199   andi(tmp1, tmp1, CacheLineSize - 1);
4200 
4201   // tmp1: the number of bytes to be filled to align the base with cache line size.
4202   add(base, base, tmp1);
4203   srai(tmp2, tmp1, 3);
4204   sub(cnt, cnt, tmp2);
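       // Each sd below zeroes one word (8 bytes) and occupies 4 bytes of code, so
       // the backward jump offset into the table of stores is bytes_to_align / 2.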
4205   srli(tmp2, tmp1, 1);
4206   la(tmp1, initial_table_end);
4207   sub(tmp2, tmp1, tmp2);
4208   jr(tmp2);
4209   for (int i = -CacheLineSize + wordSize; i < 0; i += wordSize) {
4210     sd(zr, Address(base, i));
4211   }
4212   bind(initial_table_end);
4213 
4214   mv(tmp1, CacheLineSize / wordSize);
4215   bind(loop);
4216   cbo_zero(base);
4217   sub(cnt, cnt, tmp1);
4218   add(base, base, CacheLineSize);
4219   bge(cnt, tmp1, loop);
4220 }
4221 
4222 #define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                     \
4223 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) {     \
4224   Label done;                                                                             \
4225   assert_different_registers(dst, tmp);                                                   \
4226   fclass_##FLOATSIG(tmp, src);                                                            \
4227   mv(dst, zr);                                                                            \
4228   /* check if src is NaN */                                                               \
4229   andi(tmp, tmp, 0b1100000000);                                                           \
4230   bnez(tmp, done);                                                                        \
4231   FLOATCVT(dst, src);                                                                     \
4232   bind(done);                                                                             \
4233 }
4234 
4235 FCVT_SAFE(fcvt_w_s, s);
4236 FCVT_SAFE(fcvt_l_s, s);
4237 FCVT_SAFE(fcvt_w_d, d);
4238 FCVT_SAFE(fcvt_l_d, d);
4239 
4240 #undef FCVT_SAFE
4241 
4242 #define FCMP(FLOATTYPE, FLOATSIG)                                                       \
4243 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,            \
4244                                          FloatRegister Rs2, int unordered_result) {     \
4245   Label Ldone;                                                                          \
4246   if (unordered_result < 0) {                                                           \
4247     /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */    \
4248     /* installs 1 if gt else 0 */                                                       \
4249     flt_##FLOATSIG(result, Rs2, Rs1);                                                   \
4250     /* Rs1 > Rs2, install 1 */                                                          \
4251     bgtz(result, Ldone);                                                                \
4252     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4253     addi(result, result, -1);                                                           \
4254     /* Rs1 = Rs2, install 0 */                                                          \
4255     /* NaN or Rs1 < Rs2, install -1 */                                                  \
4256     bind(Ldone);                                                                        \
4257   } else {                                                                              \
4258     /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */    \
4259     /* installs 1 if gt or unordered else 0 */                                          \
4260     flt_##FLOATSIG(result, Rs1, Rs2);                                                   \
4261     /* Rs1 < Rs2, install -1 */                                                         \
4262     bgtz(result, Ldone);                                                                \
4263     feq_##FLOATSIG(result, Rs1, Rs2);                                                   \
4264     addi(result, result, -1);                                                           \
4265     /* Rs1 = Rs2, install 0 */                                                          \
4266     /* NaN or Rs1 > Rs2, install 1 */                                                   \
4267     bind(Ldone);                                                                        \
4268     neg(result, result);                                                                \
4269   }                                                                                     \
4270 }
4271 
4272 FCMP(float, s);
4273 FCMP(double, d);
4274 
4275 #undef FCMP
4276 
4277 // Zero words; len is in bytes
4278 // Destroys all registers except addr
4279 // len must be a nonzero multiple of wordSize
4280 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
4281   assert_different_registers(addr, len, tmp, t0, t1);
4282 
4283 #ifdef ASSERT
4284   {
4285     Label L;
4286     andi(t0, len, BytesPerWord - 1);
4287     beqz(t0, L);
4288     stop("len is not a multiple of BytesPerWord");
4289     bind(L);
4290   }
4291 #endif // ASSERT
4292 
4293 #ifndef PRODUCT
4294   block_comment("zero memory");
4295 #endif // PRODUCT
4296 
4297   Label loop;
4298   Label entry;
4299 
4300   // Algorithm:
4301   //
4302   //  t0 = cnt & 7
4303   //  cnt -= t0
4304   //  p += t0
4305   //  switch (t0) {
4306   //    do {
4307   //      cnt -= 8
4308   //        p[-8] = 0
4309   //      case 7:
4310   //        p[-7] = 0
4311   //      case 6:
4312   //        p[-6] = 0
4313   //        ...
4314   //      case 1:
4315   //        p[-1] = 0
4316   //      case 0:
4317   //        p += 8
4318   //     } while (cnt)
4319   //  }
4320 
4321   const int unroll = 8;   // Number of sd(zr) instructions we'll unroll
4322 
4323   srli(len, len, LogBytesPerWord);
4324   andi(t0, len, unroll - 1);  // t0 = cnt % unroll
4325   sub(len, len, t0);          // cnt -= (cnt % unroll)
4326   // tmp always points to the end of the region we're about to zero
4327   shadd(tmp, t0, addr, t1, LogBytesPerWord);
4328   la(t1, entry);
4329   slli(t0, t0, 2);            // each sd() in the unrolled loop is a single 4-byte instruction
4330   sub(t1, t1, t0);            // so entry - 4 * t0 starts at the last t0 of those stores
4331   jr(t1);
4332   bind(loop);
4333   sub(len, len, unroll);
4334   for (int i = -unroll; i < 0; i++) {
4335     sd(zr, Address(tmp, i * wordSize));
4336   }
4337   bind(entry);
4338   add(tmp, tmp, unroll * wordSize);
4339   bnez(len, loop);
4340 }
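
// Worked example for the code above (illustrative only): len = 88 bytes, i.e. 11 words.
// Then t0 = 11 % 8 = 3, len becomes 8 and tmp = addr + 3 * wordSize. The computed
// jump lands on the last 3 stores of the unrolled body, zeroing words 0..2; after
// tmp is advanced by 8 * wordSize the loop body runs once more, zeroing words 3..10.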
4341 
4342 // shift left by shamt and add
4343 // Rd = (Rs1 << shamt) + Rs2
4344 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
4345   if (UseZba) {
4346     if (shamt == 1) {
4347       sh1add(Rd, Rs1, Rs2);
4348       return;
4349     } else if (shamt == 2) {
4350       sh2add(Rd, Rs1, Rs2);
4351       return;
4352     } else if (shamt == 3) {
4353       sh3add(Rd, Rs1, Rs2);
4354       return;
4355     }
4356   }
4357 
4358   if (shamt != 0) {
4359     assert_different_registers(Rs2, tmp);
4360     slli(tmp, Rs1, shamt);
4361     add(Rd, Rs2, tmp);
4362   } else {
4363     add(Rd, Rs1, Rs2);
4364   }
4365 }
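
// For example, a hypothetical call
//   shadd(t0, index, base, t1, LogBytesPerWord)
// computes t0 = base + (index << 3), i.e. the address of element 'index' in an
// array of 64-bit words, using a single sh3add when Zba is available.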
4366 
4367 void MacroAssembler::zero_extend(Register dst, Register src, int bits) {
4368   if (UseZba && bits == 32) {
4369     zext_w(dst, src);
4370     return;
4371   }
4372 
4373   if (UseZbb && bits == 16) {
4374     zext_h(dst, src);
4375     return;
4376   }
4377 
4378   if (bits == 8) {
4379     zext_b(dst, src);
4380   } else {
4381     slli(dst, src, XLEN - bits);
4382     srli(dst, dst, XLEN - bits);
4383   }
4384 }
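
// The shift pair is the generic fallback; e.g. for bits == 16 without Zbb,
//   slli(dst, src, 48); srli(dst, dst, 48);
// keeps only the low 16 bits and zero-fills the rest, so 0xffff8000 becomes 0x8000.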
4385 
4386 void MacroAssembler::sign_extend(Register dst, Register src, int bits) {
4387   if (UseZbb) {
4388     if (bits == 8) {
4389       sext_b(dst, src);
4390       return;
4391     } else if (bits == 16) {
4392       sext_h(dst, src);
4393       return;
4394     }
4395   }
4396 
4397   if (bits == 32) {
4398     sext_w(dst, src);
4399   } else {
4400     slli(dst, src, XLEN - bits);
4401     srai(dst, dst, XLEN - bits);
4402   }
4403 }
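
// Here the arithmetic right shift replicates the sign bit; e.g. for bits == 16,
//   slli(dst, src, 48); srai(dst, dst, 48);
// turns 0x8000 into -32768 (0xffffffffffff8000) while 0x7fff stays 0x7fff.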
4404 
4405 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp) {
4407   if (src1 == src2) {
4408     mv(dst, zr);
4409     return;
4410   }
4411   Label done;
4412   Register left = src1;
4413   Register right = src2;
4414   if (dst == src1) {
4415     assert_different_registers(dst, src2, tmp);
4416     mv(tmp, src1);
4417     left = tmp;
4418   } else if (dst == src2) {
4419     assert_different_registers(dst, src1, tmp);
4420     mv(tmp, src2);
4421     right = tmp;
4422   }
4423 
4424   // installs 1 if gt else 0
4425   slt(dst, right, left);
4426   bnez(dst, done);
4427   slt(dst, left, right);
4428   // dst = -1 if lt; dst = 0 if eq
4429   neg(dst, dst);
4430   bind(done);
4431 }
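
// A rough C sketch of the result computed above (illustrative only), matching
// Java's lcmp:
//
//   int cmp_l2i(long a, long b) {
//     if (a > b) return 1;
//     if (a < b) return -1;
//     return 0;
//   }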
4432 
4433 // The java_calling_convention describes stack locations as ideal slots on
4434 // a frame with no abi restrictions. Since we must observe abi restrictions
4435 // (like the placement of the register window) the slots must be biased by
4436 // the following value.
4437 static int reg2offset_in(VMReg r) {
4438   // Account for saved fp and ra
4439   // This should really be in_preserve_stack_slots
4440   return r->reg2stack() * VMRegImpl::stack_slot_size;
4441 }
4442 
4443 static int reg2offset_out(VMReg r) {
4444   return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
4445 }
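
// Note: reg2offset_in() offsets are applied fp-relative to read incoming arguments
// from the caller's frame, while reg2offset_out() offsets are applied sp-relative
// to write outgoing arguments, as in the stack-to-stack moves below.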
4446 
4447 // On 64-bit we will store integer-like items to the stack as
4448 // 64-bit items (riscv64 ABI) even though Java would only store
4449 // 32 bits for a parameter. On 32-bit it would simply be 32 bits,
4450 // so this routine does 32->32 on 32-bit and 32->64 on 64-bit.
4451 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
4452   if (src.first()->is_stack()) {
4453     if (dst.first()->is_stack()) {
4454       // stack to stack
4455       ld(tmp, Address(fp, reg2offset_in(src.first())));
4456       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4457     } else {
4458       // stack to reg
4459       lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4460     }
4461   } else if (dst.first()->is_stack()) {
4462     // reg to stack
4463     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
4464   } else {
4465     if (dst.first() != src.first()) {
4466       sign_extend(dst.first()->as_Register(), src.first()->as_Register(), 32);
4467     }
4468   }
4469 }
4470 
4471 // An oop arg. Must pass a handle not the oop itself
4472 void MacroAssembler::object_move(OopMap* map,
4473                                  int oop_handle_offset,
4474                                  int framesize_in_slots,
4475                                  VMRegPair src,
4476                                  VMRegPair dst,
4477                                  bool is_receiver,
4478                                  int* receiver_offset) {
4479   assert_cond(map != nullptr && receiver_offset != nullptr);
4480 
4481   // must pass a handle. First figure out the location we use as a handle
4482   Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
4483 
4484   // See if the oop is null; if it is, we need no handle
4485 
4486   if (src.first()->is_stack()) {
4487     // Oop is already on the stack as an argument
4488     int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
4489     map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
4490     if (is_receiver) {
4491       *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
4492     }
4493 
4494     ld(t0, Address(fp, reg2offset_in(src.first())));
4495     la(rHandle, Address(fp, reg2offset_in(src.first())));
4496     // conditionally move a null
4497     Label notZero1;
4498     bnez(t0, notZero1);
4499     mv(rHandle, zr);
4500     bind(notZero1);
4501   } else {
4502 
4503     // Oop is in a register; we must store it to the space we reserve
4504     // on the stack for oop_handles and pass a handle if the oop is non-null
4505 
4506     const Register rOop = src.first()->as_Register();
4507     int oop_slot = -1;
4508     if (rOop == j_rarg0) {
4509       oop_slot = 0;
4510     } else if (rOop == j_rarg1) {
4511       oop_slot = 1;
4512     } else if (rOop == j_rarg2) {
4513       oop_slot = 2;
4514     } else if (rOop == j_rarg3) {
4515       oop_slot = 3;
4516     } else if (rOop == j_rarg4) {
4517       oop_slot = 4;
4518     } else if (rOop == j_rarg5) {
4519       oop_slot = 5;
4520     } else if (rOop == j_rarg6) {
4521       oop_slot = 6;
4522     } else {
4523       assert(rOop == j_rarg7, "wrong register");
4524       oop_slot = 7;
4525     }
4526 
4527     oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
4528     int offset = oop_slot * VMRegImpl::stack_slot_size;
4529 
4530     map->set_oop(VMRegImpl::stack2reg(oop_slot));
4531     // Store oop in handle area, may be null
4532     sd(rOop, Address(sp, offset));
4533     if (is_receiver) {
4534       *receiver_offset = offset;
4535     }
4536 
4537     // rOop may be the same as rHandle
4538     if (rOop == rHandle) {
4539       Label isZero;
4540       beqz(rOop, isZero);
4541       la(rHandle, Address(sp, offset));
4542       bind(isZero);
4543     } else {
4544       Label notZero2;
4545       la(rHandle, Address(sp, offset));
4546       bnez(rOop, notZero2);
4547       mv(rHandle, zr);
4548       bind(notZero2);
4549     }
4550   }
4551 
4552   // If arg is on the stack then place it; otherwise it is already in the correct reg.
4553   if (dst.first()->is_stack()) {
4554     sd(rHandle, Address(sp, reg2offset_out(dst.first())));
4555   }
4556 }
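
// A rough C sketch of the handle passed on (illustrative only, with 'slot'
// standing for the stack location the oop lives in, either the incoming
// argument slot or the reserved handle-area slot):
//
//   rHandle = (*slot != nullptr) ? (intptr_t)slot : 0;
//
// i.e. the native callee receives the address of the slot holding the oop,
// or a null handle when the oop itself is null.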
4557 
4558 // A float arg may have to do float reg to int reg conversion
4559 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
4560   assert(src.first()->is_stack() && dst.first()->is_stack() ||
4561          src.first()->is_reg() && dst.first()->is_reg() ||
4562          src.first()->is_stack() && dst.first()->is_reg(), "Unexpected error");
4563   if (src.first()->is_stack()) {
4564     if (dst.first()->is_stack()) {
4565       lwu(tmp, Address(fp, reg2offset_in(src.first())));
4566       sw(tmp, Address(sp, reg2offset_out(dst.first())));
4567     } else if (dst.first()->is_Register()) {
4568       lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4569     } else {
4570       ShouldNotReachHere();
4571     }
4572   } else if (src.first() != dst.first()) {
4573     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
4574       fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
4575     } else {
4576       ShouldNotReachHere();
4577     }
4578   }
4579 }
4580 
4581 // A long move
4582 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
4583   if (src.first()->is_stack()) {
4584     if (dst.first()->is_stack()) {
4585       // stack to stack
4586       ld(tmp, Address(fp, reg2offset_in(src.first())));
4587       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4588     } else {
4589       // stack to reg
4590       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4591     }
4592   } else if (dst.first()->is_stack()) {
4593     // reg to stack
4594     sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
4595   } else {
4596     if (dst.first() != src.first()) {
4597       mv(dst.first()->as_Register(), src.first()->as_Register());
4598     }
4599   }
4600 }
4601 
4602 // A double move
4603 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
4604   assert(src.first()->is_stack() && dst.first()->is_stack() ||
4605          src.first()->is_reg() && dst.first()->is_reg() ||
4606          src.first()->is_stack() && dst.first()->is_reg(), "Unexpected error");
4607   if (src.first()->is_stack()) {
4608     if (dst.first()->is_stack()) {
4609       ld(tmp, Address(fp, reg2offset_in(src.first())));
4610       sd(tmp, Address(sp, reg2offset_out(dst.first())));
4611     } else if (dst.first()->is_Register()) {
4612       ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
4613     } else {
4614       ShouldNotReachHere();
4615     }
4616   } else if (src.first() != dst.first()) {
4617     if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
4618       fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
4619     } else {
4620       ShouldNotReachHere();
4621     }
4622   }
4623 }
4624 
4625 void MacroAssembler::rt_call(address dest, Register tmp) {
4626   CodeBlob *cb = CodeCache::find_blob(dest);
4627   RuntimeAddress target(dest);
4628   if (cb) {
4629     far_call(target);
4630   } else {
4631     relocate(target.rspec(), [&] {
4632       int32_t offset;
4633       la_patchable(tmp, target, offset);
4634       jalr(x1, tmp, offset);
4635     });
4636   }
4637 }
4638 
4639 // Implements fast-locking.
4640 // Branches to slow upon failure to lock the object.
4641 // Falls through upon success.
4642 //
4643 //  - obj: the object to be locked
4644 //  - hdr: the header, already loaded from obj, will be destroyed
4645 //  - tmp1, tmp2: temporary registers, will be destroyed
4646 void MacroAssembler::fast_lock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow) {
4647   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4648   assert_different_registers(obj, hdr, tmp1, tmp2);
4649 
4650   // Check if we would have space on lock-stack for the object.
4651   lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4652   mv(tmp2, (unsigned)LockStack::end_offset());
4653   bge(tmp1, tmp2, slow, /* is_far */ true);
4654 
4655   // Load (object->mark() | 1) into hdr
4656   ori(hdr, hdr, markWord::unlocked_value);
4657   // Clear the unlocked bit, producing the locked header, in tmp2
4658   xori(tmp2, hdr, markWord::unlocked_value);
4659 
4660   // Try to swing header from unlocked to locked
4661   Label success;
4662   cmpxchgptr(hdr, tmp2, obj, tmp1, success, &slow);
4663   bind(success);
4664 
4665   // After successful lock, push object on lock-stack
4666   lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4667   add(tmp2, xthread, tmp1);
4668   sd(obj, Address(tmp2, 0));
4669   addw(tmp1, tmp1, oopSize);
4670   sw(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4671 }
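
// A rough C sketch of the fast-lock above (illustrative only; CAS and
// lock_stack_top are shorthand for cmpxchgptr and the byte offset stored at
// JavaThread::lock_stack_top_offset()):
//
//   if (lock_stack_top >= LockStack::end_offset()) goto slow;  // no room on lock-stack
//   markWord expected = hdr | unlocked_value;                  // assume the object is unlocked
//   markWord locked   = expected ^ unlocked_value;             // clear the unlocked bit
//   if (!CAS(&obj->mark(), expected, locked)) goto slow;
//   *(oop*)((char*)thread + lock_stack_top) = obj;             // push obj on the lock-stack
//   lock_stack_top += oopSize;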
4672 
4673 // Implements fast-unlocking.
4674 // Branches to slow upon failure.
4675 // Falls through upon success.
4676 //
4677 // - obj: the object to be unlocked
4678 // - hdr: the (pre-loaded) header of the object
4679 // - tmp1, tmp2: temporary registers
4680 void MacroAssembler::fast_unlock(Register obj, Register hdr, Register tmp1, Register tmp2, Label& slow) {
4681   assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4682   assert_different_registers(obj, hdr, tmp1, tmp2);
4683 
4684 #ifdef ASSERT
4685   {
4686     // The following checks rely on the fact that LockStack is only ever modified by
4687     // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4688     // entries after inflation is delayed in that case.
4689 
4690     // Check for lock-stack underflow.
4691     Label stack_ok;
4692     lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4693     mv(tmp2, (unsigned)LockStack::start_offset());
4694     bgt(tmp1, tmp2, stack_ok);
4695     STOP("Lock-stack underflow");
4696     bind(stack_ok);
4697   }
4698   {
4699     // Check if the top of the lock-stack matches the unlocked object.
4700     Label tos_ok;
4701     subw(tmp1, tmp1, oopSize);
4702     add(tmp1, xthread, tmp1);
4703     ld(tmp1, Address(tmp1, 0));
4704     beq(tmp1, obj, tos_ok);
4705     STOP("Top of lock-stack does not match the unlocked object");
4706     bind(tos_ok);
4707   }
4708   {
4709     // Check that hdr is fast-locked.
4710     Label hdr_ok;
4711     andi(tmp1, hdr, markWord::lock_mask_in_place);
4712     beqz(tmp1, hdr_ok);
4713     STOP("Header is not fast-locked");
4714     bind(hdr_ok);
4715   }
4716 #endif
4717 
4718   // Load the new header (unlocked) into tmp1
4719   ori(tmp1, hdr, markWord::unlocked_value);
4720 
4721   // Try to swing header from locked to unlocked
4722   Label success;
4723   cmpxchgptr(hdr, tmp1, obj, tmp2, success, &slow);
4724   bind(success);
4725 
4726   // After successful unlock, pop object from lock-stack
4727   lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4728   subw(tmp1, tmp1, oopSize);
4729 #ifdef ASSERT
4730   add(tmp2, xthread, tmp1);
4731   sd(zr, Address(tmp2, 0));
4732 #endif
4733   sw(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
4734 }
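
// A rough C sketch of the fast-unlock above (illustrative only, same shorthand
// as for fast_lock):
//
//   markWord locked   = hdr;                           // the expected fast-locked header
//   markWord unlocked = hdr | unlocked_value;          // set the unlocked bit again
//   if (!CAS(&obj->mark(), locked, unlocked)) goto slow;
//   lock_stack_top -= oopSize;                         // pop obj off the lock-stack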